## The following packages will be needed for the analysis
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.tree import plot_tree
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)
sns.set_style("whitegrid")
from matplotlib import rcParams
rcParams['figure.figsize'] = 15, 5
from sklearn.model_selection import train_test_split #for train and test split
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings("ignore")
import nltk
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
stopwords = stopwords.words('english')
from imblearn.over_sampling import SMOTE
from matplotlib import rcParams
rcParams['figure.figsize'] = 15, 5
from sklearn.model_selection import train_test_split #for train and test split
from sklearn.model_selection import ParameterGrid, GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, classification_report
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\fahad\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date!
df = pd.read_csv("Navigation_Dataset.csv")
df.head()
| foresee_respondent | foresee_session_id | wt_visitor_id | day | Survey_Category | Product | Platform | main_task | CSAT | TA | Effort | Navigation_Difficulties | Relevance | called_flag | app_session_id | event_count | page_loads | link_clicks | impressions | urls | Searches | Page Events | Page Views | Average Page Depth | Reloads | Time Spent per Visit (seconds) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | axg6v1l5DA7iV2mItNiXZXIt1NwbFFzV | 03aa8b78-4758-4fde-8260-10924007c86d | 7.413472e+09 | 11/6/21 | Account Management | DTV | App | Set up or manage Access ID/website profile (in... | 66.67 | No | 5 | Yes | 7 | 1 | 45d04218-444f-434f-a9de-9542ec0ae70d | 128.0 | 60.0 | 22.0 | 3.0 | , /acctmgmt/fpwd/createnewpwd, /my/virtual/alr... | 0.0 | 21.0 | 13.0 | 65.0 | 1.0 | 217.0 |
| 1 | AxMp11NgEd5gUQQk5tZppQ4C | 08ecdb46-47c4-4d40-bbbc-dbeee788e0f5 | 7.690689e+09 | 1/17/22 | Account Management | Wireless | Desktop | Other | 0.00 | No | 1 | Yes | 1 | 1 | tghbavyJYORybfbQh-ciD72cBdYayZqTjztRRR9zQBM | 39.0 | 11.0 | 14.0 | 9.0 | /acctmgmt/accountoverview, /acctmgmt/wireless/... | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | c5odUJVxxMVBldAkEhdEIw4C | 10d32120-8023-4720-8bdb-e63f57e99d65 | 6.756765e+09 | 12/12/21 | Account Management | Other | Desktop | Add, change or downgrade plans and features (e... | 0.00 | No | 1 | Yes | 1 | 0 | mTkboOqXd6RjBQoDu0-1ahoiMsjVCpF4heKBmxb9UBE | 19.0 | 3.0 | 3.0 | 5.0 | /support/, /support/contact-us/ | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | IMAyzf8pTFEiuRdoBZSPNnhnqEnzn5YP | 208b4dd7-a0fd-40c3-b44e-84ff3be1f46b | 7.570705e+09 | 1/14/22 | Account Management | BB | App | Check usage | 88.89 | No | 1 | Yes | 9 | 0 | 6ItGbcSYnePunpjfpKvx_CiPWNbInRq8nwj0O3shvPw | 49.0 | 13.0 | 7.0 | 15.0 | /virtual/nativemorenav, /acctmgmt/billandpay/h... | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | v7LSG4RWKj40pIUyRmIp3Z1BEb5FAo2k | 290bf194-6762-44bd-8aad-8027a4f358c1 | 7.138217e+09 | 1/18/22 | Account Management | Wireless | App | Make a payment | 22.22 | No | 8 | Yes | 3 | 0 | I0PFGM2HpNuk6fHTkrW_ayC7Qc76NM2fw7swz5j0U4k | 15.0 | 7.0 | 2.0 | 2.0 | /support/article/, /acctmgmt/makepayment/payme... | NaN | NaN | NaN | NaN | NaN | NaN |
## Discard values of each feature that are 2x standard deviation away from the mean values
for feature in df.select_dtypes(include=['int16', 'int32', 'int64', 'float16', 'float32', 'float64']):
df = df[np.abs(df[feature]-df[feature].mean()) <= (2*df[feature].std())]
df1=df['main_task'].str.split('/', expand=True)[:50]
df1
| 0 | 1 | 2 | |
|---|---|---|---|
| 8 | Make a payment | None | None |
| 25 | Update payment method | None | None |
| 62 | Other | None | None |
| 68 | View current bill | None | None |
| 75 | Enroll | unenroll in AutoPay | None |
| 79 | Request a wireless device trade in or check th... | None | None |
| 81 | Make a payment arrangement | None | None |
| 118 | Make a payment | None | None |
| 122 | Troubleshooting - fix a problem with my device... | None | None |
| 125 | View previous bill(s) | None | None |
| 145 | View current bill | None | None |
| 152 | Make a payment | None | None |
| 153 | Other | None | None |
| 157 | Troubleshooting - fix a problem with my device... | None | None |
| 169 | Plan, features or service help - review, chang... | None | None |
| 180 | View current bill | None | None |
| 183 | Troubleshooting - fix a problem with my device... | None | None |
| 196 | Make a payment | None | None |
| 240 | View current bill | None | None |
| 247 | Billing, payment and usage help - pay, review ... | None | None |
| 249 | Other | None | None |
| 307 | Purchase, upgrade, check eligibility | None | None |
| 308 | Check usage | None | None |
| 317 | Other | None | None |
| 356 | Other | None | None |
| 373 | View current bill | None | None |
| 396 | Troubleshooting - fix a problem with my device... | None | None |
| 404 | View current bill | None | None |
| 408 | Troubleshooting - fix a problem with my device... | None | None |
| 430 | Learn how to use products, services or equipme... | None | None |
| 447 | Make a payment | None | None |
| 460 | Set up | activate products or services | None |
| 464 | Plan, features or service help - review, chang... | None | None |
| 475 | Plan, features or service help - review, chang... | None | None |
| 482 | Other | None | None |
| 495 | Other | None | None |
| 514 | Set up or manage Access ID | website profile (includes linking | unlinking accounts) |
| 541 | Update payment method | None | None |
| 559 | Other | None | None |
| 568 | View previous bill(s) | None | None |
| 578 | Other | None | None |
| 581 | Make a payment | None | None |
| 594 | Other | None | None |
| 612 | Make a payment | None | None |
| 625 | Check usage | None | None |
| 629 | Contact AT&T Customer Service | None | None |
| 635 | Set up or manage email account (includes email... | None | None |
| 654 | Set up or manage Access ID | website profile (includes linking | unlinking accounts) |
| 663 | Other | None | None |
| 666 | Account assistance - register new account, get... | None | None |
df1=df['urls'].str.split(',', expand=True)[:50]
##df = pd.read_csv("/dbfs/FileStore/shared_uploads/fa3015@att.com/Navigation_Dataset.csv")
##df.head()
df2=df[df.Searches==1]
df2
| foresee_respondent | foresee_session_id | wt_visitor_id | day | Survey_Category | Product | Platform | main_task | CSAT | TA | Effort | Navigation_Difficulties | Relevance | called_flag | app_session_id | event_count | page_loads | link_clicks | impressions | urls | Searches | Page Events | Page Views | Average Page Depth | Reloads | Time Spent per Visit (seconds) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 62 | 4VZ9cYkNlwJ1ZsJR5NQFlA4C | d00c7072-7405-45ef-931d-10a05e5e4cc6 | 7.383279e+09 | 11/16/21 | Account Management | Wireless | Desktop | Other | 0.00 | No | 3 | Yes | 1 | 0 | J6oDR2EdRSrLWKKPdpk1OlyvBbDJPkNxho0yMP_WK28 | 39.0 | 10.0 | 18.0 | 6.0 | /acctmgmt/billandpay, /acctmgmt/billandpay/his... | 1.0 | 30.0 | 10.0 | 16.500000 | 1.0 | 425.0 |
| 68 | AYMospUAEsAcw8URYlJxIg4C | facadf1d-cfb0-42cd-85aa-0dc8b95019e0 | 7.611786e+09 | 2/15/22 | Account Management | BB | Desktop | View current bill | 48.15 | No | 2 | No | 3 | 0 | lQJTkB9owNWztujkvYBaRuEKdpzw6MvvtcK_kXcYTfo | 20.0 | 5.0 | 3.0 | 4.0 | /acctmgmt/accountoverview, /acctmgmt/billandpa... | 1.0 | 16.0 | 7.0 | 3.000000 | 2.0 | 511.0 |
| 625 | qNVhpw1EA0jnv2L792HC3E17Bhyg8tfo | d8ab5448-60c4-4c7d-9045-27a414f9b61c | 7.724436e+09 | 2/1/22 | Account Management | Wireless | Mobile Web | Check usage | 44.44 | Yes | 3 | Yes | 4 | 0 | fovtSFw2ZPH6qjAd2_bgsjzYfBxrOTP4eLXcAqe0omY | 22.0 | 6.0 | 6.0 | 4.0 | /acctmgmt/accountoverview, , /help/prepaid/, /... | 1.0 | 14.0 | 6.0 | 4.500000 | 0.0 | 139.0 |
| 937 | 4hQdgIMtJootQxFEpRd50w4C | 0e65c30e-88d3-4c7e-834a-71cdea355368 | 7.755826e+09 | 2/6/22 | Account Management | Other services from AT&T (Home phone, DSL, etc.) | Desktop | Make a payment | 0.00 | No | 1 | Yes | 1 | 0 | 6a7b614e-fbdd-493c-8f11-3552986a1c25 | 56.0 | 23.0 | 11.0 | 4.0 | /acctmgmt/registration/welcome, /acctmgmt/acco... | 1.0 | 12.0 | 3.0 | 38.000000 | 0.0 | 466.0 |
| 1100 | F89cQ1QEkcVhdRA4lAMQFg4C | 9a2aae94-e7d0-4580-84c6-5aa6c5c7b977 | 7.055668e+09 | 2/3/22 | Account Management | Wireless | Desktop | Add, change or downgrade plans and features (e... | 81.48 | No | 8 | No | 8 | 0 | 459ca1d2-8787-4cb3-ae19-d1d99877a316 | 27.0 | 9.0 | 7.0 | 3.0 | /support/article/, /acctmgmt/wireless/device/o... | 1.0 | 20.0 | 11.0 | 6.000000 | 0.0 | 1882.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 129062 | VBN4ZUEt9xd991UpsMwZVQ4C | 2ce2596e-8762-45b3-a4e7-d2e1571f41ad | 6.856674e+09 | 12/9/21 | Account Management | Wireless | Desktop | Request a wireless device trade in or check th... | 14.81 | No | 5 | Yes | 5 | 0 | Ziw5u6FN5RlLwTdM_rhd-NbLv8vbrdu0M89pE9mkX4Y | 23.0 | 6.0 | 7.0 | 3.0 | /acctmgmt/registration/welcome, /acctmgmt/regi... | 1.0 | 33.0 | 16.0 | 7.500000 | 0.0 | 1147.0 |
| 133261 | RdxhFQgY0JdppNx8c581Mg4C | 85336e4c-9512-431c-9da2-da33d6d2787d | 7.741791e+09 | 2/1/22 | Support | BB | Desktop | Find out how to move my service(s) to a new ad... | 0.00 | No | 1 | Yes | 2 | 0 | gguCr9R8AY57um8yJMC1Mm5_PpL44Re7VkCxWmWBKjw | 47.0 | 8.0 | 12.0 | 6.0 | /buy/broadband/check-availability/modal, /shop... | 1.0 | 35.0 | 15.0 | 8.000000 | 1.0 | 318.0 |
| 133993 | QBihOgjCDynWdJ4UykIQNX9gSv304HeZ | 4a291b4b-cff1-4f04-a445-9dc0c4b6c7b0 | 7.613172e+09 | 12/1/21 | Account Management | Wireless | Mobile Web | View current bill | 100.00 | Yes | 10 | No | 10 | 0 | v9HwIhSGqRbz6A-DE6eFF04E3kuFKtwZfMahkfWm6-o | 22.0 | 5.0 | 3.0 | 8.0 | , /features/myatt-app/, /, /my/virtual/viewbil... | 1.0 | 21.0 | 9.0 | 4.000000 | 2.0 | 416.0 |
| 134043 | lV8YlFBV1Rg5gMchVZAtNw4C | f59d92d5-15f8-4510-be23-6a47a63dfa7d | 7.615036e+09 | 12/23/21 | Support | DTV | Desktop | Contact AT&T Customer Service - find phone num... | 0.00 | No | 1 | No | 1 | 0 | 8rOWe7OuIx5QmKCZ1Q4Od-bOIx_tYd3RJpfGX0f2Cu0 | 50.0 | 14.0 | 10.0 | 13.0 | /help/cancellation-policy/, , /acctmgmt/accoun... | 1.0 | 5.0 | 2.0 | 15.500000 | 0.0 | 722.0 |
| 135124 | dNpVINcMtlgRIFsJYot9pw4C | df3da0a6-cff4-4943-af7c-79c3d3c7c241 | 6.846253e+09 | 11/4/21 | Account Management | Wireless | Desktop | Check order status | 0.00 | No | 1 | No | 1 | 0 | MldXfEft5673fopgmjVtooJG7yJ--RBHyztaQNBRMzU | 63.0 | 12.0 | 23.0 | 13.0 | /acctmgmt/wireless/device/optionsettings, /sup... | 1.0 | 19.0 | 12.0 | 27.916667 | 1.0 | 483.0 |
99 rows × 26 columns
#df.loc[['Support']]
df1=df[df['Survey_Category']== 'Account Management']
df1=df1.set_index('Survey_Category')
df['Search']=["Didn't use search" if x == 0.0 else 'Used Search' for x in df['Searches']]
df1 = df[df['Searches']== 0.0]
fig,axs=plt.subplots(ncols=2)
fig.set_size_inches(15,5)
sns.histplot(binwidth=0.5, x='Survey_Category', hue = 'TA', data=df1[df1['Survey_Category']== 'Support'], stat="density", multiple="dodge", shrink=.8, ax=axs[0])
sns.histplot(binwidth=0.5, x='Survey_Category', hue = 'Navigation_Difficulties', data=df1[df1['Survey_Category']== 'Support'].sort_values(by = "Navigation_Difficulties"), stat="density", multiple="dodge", shrink=.8, ax=axs[1])
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
axs[0].set_title('Figure 1: TA when searches were not used')
axs[1].set_title('Figure 1: Navigation_Difficulties when searches were not used')
plt.tight_layout()
plt.show()
df1 = df[df['Searches']== 1.0]
fig,axs=plt.subplots(ncols=2)
fig.set_size_inches(15,5)
sns.histplot(binwidth=0.5, x='Survey_Category', hue = 'TA', data=df1[df1['Survey_Category']== 'Support'], stat="density", multiple="dodge", shrink=.8, ax=axs[0])
sns.histplot(binwidth=0.5, x='Survey_Category', hue = 'Navigation_Difficulties', data=df1[df1['Survey_Category']== 'Support'], stat="density", multiple="dodge", shrink=.8, ax=axs[1])
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
axs[0].set_title('Figure 1: TA when searches were used')
axs[1].set_title('Figure 1: Navigation_Difficulties when searches were used')
plt.tight_layout()
plt.show()
from scipy.stats import chi2_contingency
# defining the table
data = [[207, 282, 241], [234, 242, 232]]
stat, p, dof, expected = chi2_contingency(data)
# interpret p-value
alpha = 0.05
print("p value is " + str(p))
if p <= alpha:
print('Dependent (reject H0)')
else:
print('Independent (H0 holds true)')
p value is 0.1031971404730939 Independent (H0 holds true)
rcParams['figure.figsize'] = 20, 8
sns.histplot(binwidth=0.5, y="Product", hue="TA", data=df.sort_values(by = 'TA', ascending = True), stat="density", multiple="dodge", shrink=.8)
plt.xlabel("Percentage");
df[['Time Spent per Visit (seconds)', 'Platform']].groupby("Platform").mean().sort_values(ascending = True, by = 'Time Spent per Visit (seconds)').plot(kind = 'barh')
<AxesSubplot:ylabel='Platform'>
plt.figure(figsize=(20, 14))
for e, column in enumerate(['Time Spent per Visit (seconds)', 'Searches', 'Average Page Depth', 'Page Events',
'event_count', 'page_loads', 'link_clicks']):
plt.subplot(4, 3, e + 1)
sns.barplot( data=df, x='Platform', y=column, palette="husl")
plt.figure(figsize=(20, 14))
for e, column in enumerate(['CSAT', 'Navigation_Difficulties']):
plt.subplot(4, 3, e + 1)
#sns.lineplot( data=df.reset_index(), x=column, y='TA', palette="husl", y = df.reset_index().loc[df.reset_index()['Survey_Category'] == "Support"]['Survey_Category'])
sns.histplot(binwidth=0.5, x=column , hue="Navigation_Difficulties", data = df.reset_index().loc[df.reset_index()['Survey_Category'] == "Support"], stat="density", multiple="dodge")
plt.ylabel("Percentage");
df[['Time Spent per Visit (seconds)', 'Platform']].groupby("Platform").mean().sort_values(ascending = True, by = 'Time Spent per Visit (seconds)').plot(kind = 'barh')
plt.title("Time spent on page against the platform");
df[['Searches', 'Platform']].groupby("Platform").mean().sort_values(ascending = True, by = 'Searches').plot(kind = 'barh')
plt.title("Searches against the platform");
rcParams['figure.figsize'] = 20, 8
sns.histplot(binwidth=0.5, y="Product", hue="Navigation_Difficulties", data=df.sort_values(by = 'TA', ascending = False), stat="density", multiple="dodge", shrink=.8)
plt.xlabel("Percentage");
sns.histplot(binwidth=0.5, y="Platform", hue="TA", data=df, stat="density", multiple="dodge", shrink=.8)
plt.xlabel("Percentage");
sns.histplot(binwidth=0.5, y="Platform", hue="Navigation_Difficulties", data=df, stat="density", multiple="dodge", shrink=.8)
plt.xlabel("Percentage");
rcParams['figure.figsize'] = 15, 5
sns.barplot(x='Navigation_Difficulties',y='CSAT',data=df)
<AxesSubplot:xlabel='Navigation_Difficulties', ylabel='CSAT'>
rcParams['figure.figsize'] = 15, 5
sns.barplot(x='TA',y='CSAT',data=df)
<AxesSubplot:xlabel='TA', ylabel='CSAT'>
rcParams['figure.figsize'] = 15, 5
sns.barplot(x='TA',y='Effort',data=df)
<AxesSubplot:xlabel='TA', ylabel='Effort'>
rcParams['figure.figsize'] = 15, 5
sns.barplot(x='Navigation_Difficulties',y='Effort',data=df)
<AxesSubplot:xlabel='Navigation_Difficulties', ylabel='Effort'>
rcParams['figure.figsize'] = 15, 5
sns.barplot(x='TA',y='Relevance',data=df)
<AxesSubplot:xlabel='TA', ylabel='Relevance'>
rcParams['figure.figsize'] = 15, 5
sns.barplot(x='Navigation_Difficulties',y='Relevance',data=df)
<AxesSubplot:xlabel='Navigation_Difficulties', ylabel='Relevance'>
rcParams['figure.figsize'] = 15, 5
sns.barplot(x='TA',y='Time Spent per Visit (seconds)',data=df)
<AxesSubplot:xlabel='TA', ylabel='Time Spent per Visit (seconds)'>
rcParams['figure.figsize'] = 15, 5
sns.barplot(x='Navigation_Difficulties',y='Time Spent per Visit (seconds)',data=df)
<AxesSubplot:xlabel='Navigation_Difficulties', ylabel='Time Spent per Visit (seconds)'>
selected_features = df[['Product', 'Platform', 'CSAT', 'TA',
'Effort', 'Navigation_Difficulties', 'Relevance', 'called_flag', 'event_count', 'page_loads', 'link_clicks',
'impressions', 'Searches', 'Page Events', 'Page Views',
'Average Page Depth', 'Reloads', 'Time Spent per Visit (seconds)']]
#The features with missing values
total = selected_features.isnull().sum().sort_values(ascending=False)
missing_data = pd.DataFrame(total, columns = ["Missing Values"])
missing_data
| Missing Values | |
|---|---|
| Product | 0 |
| Platform | 0 |
| Reloads | 0 |
| Average Page Depth | 0 |
| Page Views | 0 |
| Page Events | 0 |
| Searches | 0 |
| impressions | 0 |
| link_clicks | 0 |
| page_loads | 0 |
| event_count | 0 |
| called_flag | 0 |
| Relevance | 0 |
| Navigation_Difficulties | 0 |
| Effort | 0 |
| TA | 0 |
| CSAT | 0 |
| Time Spent per Visit (seconds) | 0 |
## features with missing values
df1 = df[['Average Page Depth', 'Time Spent per Visit (seconds)', 'Reloads',
'Page Views', 'Page Events', 'Searches', 'page_loads', 'impressions',
'link_clicks', 'event_count']]
for feature in df1:
mean = selected_features[feature].mean()
std = selected_features[feature].std()
is_null = selected_features[feature].isnull().sum()
# Get random numbers between the mean, std and is_null
rand_age = np.random.randint(mean - std, mean + std, size = is_null)
# fill NaN values in each feature column with random values generated
feature_slice = selected_features[feature].copy()
feature_slice[np.isnan(feature_slice)] = rand_age
selected_features[feature] = feature_slice
##Check if any column still contain missing values
selected_features[feature].isnull().sum()
0
top8_products = df.Product.value_counts().sort_values(ascending = False)[:10]
top8_products
Wireless 4313 BB 1706 DTV 590 Other 539 Other services from AT&T (Home phone, DSL, etc.) 300 Other services from AT&T (Home phone, DSL,etc.) 93 WatchTV 57 ATT TV 41 DIRECTV or DIRECTV bundled with other AT&T services (TV delivered by satellite dish)_OLD 1 AT&T TVSM or AT&T TVSM bundled with other AT&T services (streamed programming through a special receiver from AT&T)_OLD 1 Name: Product, dtype: int64
top_15_tasks = df.main_task.value_counts().sort_values(ascending = False)[:15]
top_15_tasks
Make a payment 1580 Other 1002 View current bill 849 Troubleshooting - fix a problem with my device or service 637 Add, change or downgrade plans and features (e.g. International plans, data plans, TV packages, etc.) 338 Check usage 287 Update payment method 237 Set up or manage Access ID/website profile (includes linking/unlinking accounts) 232 View previous bill(s) 204 Billing, payment and usage help - pay, review or dispute charges 194 Make a payment arrangement 189 Set up or manage email account (includes email password resets) 188 Set up/activate products or services 162 Contact AT&T Customer Service 160 Contact AT&T Customer Service - find phone numbers, see chat options, locate AT&T stores 135 Name: main_task, dtype: int64
df = df.loc[df['main_task'].isin(list(top_15_tasks.index))]
df = df.loc[df['main_task'].isin(list(top8_products.index))]
selected_features = df[['TA','called_flag', 'event_count', 'page_loads','Searches',
'Page Events', 'Page Views','Average Page Depth', 'Reloads',
'Time Spent per Visit (seconds)']]
total = selected_features.isnull().sum().sort_values(ascending=False)
missing_data = pd.DataFrame(total, columns = ["Missing Values"])
missing_data
| Missing Values | |
|---|---|
| TA | 0 |
| called_flag | 0 |
| event_count | 0 |
| page_loads | 0 |
| Searches | 0 |
| Page Events | 0 |
| Page Views | 0 |
| Average Page Depth | 0 |
| Reloads | 0 |
| Time Spent per Visit (seconds) | 0 |
selected_features.select_dtypes('object')
| TA | |
|---|---|
| 62 | No |
| 153 | No |
| 249 | No |
| 317 | Yes |
| 356 | No |
| ... | ... |
| 135446 | Yes |
| 135452 | No |
| 135822 | Yes |
| 135826 | Yes |
| 136012 | No |
1002 rows × 1 columns
for feature in selected_features.select_dtypes('object'):
print(feature)
selected_features[feature] = pd.factorize(selected_features[feature])[0]
TA
## features with missing values
df1 = df[['Average Page Depth', 'Time Spent per Visit (seconds)', 'Reloads',
'Page Views', 'Page Events', 'Searches', 'page_loads', 'event_count']]
for feature in df1:
mean = selected_features[feature].mean()
std = selected_features[feature].std()
is_null = selected_features[feature].isnull().sum()
# Get random numbers between the mean, std and is_null
rand_age = np.random.randint(mean - std, mean + std, size = is_null)
# fill NaN values in each feature column with random values generated
feature_slice = selected_features[feature].copy()
feature_slice[np.isnan(feature_slice)] = rand_age
selected_features[feature] = feature_slice
##Check if any column still contain missing values
selected_features[feature].isnull().sum()
0
## Selecting 3 boolean features and two population features
X = selected_features.drop('TA', axis = 1)
y = selected_features['TA']
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)
#Split the dataset with 80% data for training and 20% for testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 0)
from sklearn.tree import DecisionTreeClassifier
# define the model
model = DecisionTreeClassifier()
# fit the model
model.fit(X_train, y_train)
y_pred_tr = model.predict(X_train)
y_pred = model.predict(X_test)
print("Accuracy score for training data", accuracy_score(y_train, y_pred_tr, normalize=True))
print("Accuracy score for test data", accuracy_score(y_test, y_pred, normalize=True))
Accuracy score for training data 0.9990157480314961 Accuracy score for test data 0.6023622047244095
A Confusion matrix is used for evaluating the performance of a classification model. The matrix compares the actual target values with those predicted by the machine learning model.
It is a performance measurement for machine learning classification problem where output can be two or more classes. It is a table with 4 different combinations of predicted and actual values.
from sklearn.metrics import confusion_matrix
conf = confusion_matrix(y_test, y_pred)
print (classification_report(y_test, y_pred))
plt.figure(figsize=(9,6))
sns.heatmap(conf,annot=True, annot_kws={"size": 16}, fmt="d", cmap=plt.cm.Reds)
precision recall f1-score support
0 0.60 0.55 0.57 123
1 0.61 0.65 0.63 131
accuracy 0.60 254
macro avg 0.60 0.60 0.60 254
weighted avg 0.60 0.60 0.60 254
<AxesSubplot:>
### Set survey category as target variable
X = df[['Time Spent per Visit (seconds)']]
y = df['Navigation_Difficulties']
from sklearn.tree import DecisionTreeClassifier
# define the model
model = DecisionTreeClassifier()
# fit the model
model.fit(X, y)
fig = plt.figure(figsize=(25, 20))
_ = tree.plot_tree(model,
feature_names=X.columns,
class_names=["1", "0"],
filled=True)
### Set survey category as target variable
X = df[['Time Spent per Visit (seconds)']]
y = df['TA']
from sklearn.tree import DecisionTreeClassifier
# define the model
model = DecisionTreeClassifier()
# fit the model
model.fit(X, y)
fig = plt.figure(figsize=(10, 10))
_ = tree.plot_tree(model,
feature_names=X.columns,
class_names=["1", "0"],
filled=True)
# Compute confusion matrix
from sklearn.metrics import *
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print (classification_report(y_test, y_pred))
plt.figure(figsize=(9,6))
sns.heatmap(cnf_matrix,annot=True, annot_kws={"size": 16}, fmt="d", cmap=plt.cm.Reds)
precision recall f1-score support
0 0.60 0.55 0.57 123
1 0.61 0.65 0.63 131
accuracy 0.60 254
macro avg 0.60 0.60 0.60 254
weighted avg 0.60 0.60 0.60 254
<AxesSubplot:>
selected_features['TA'] = pd.factorize(selected_features['TA'])[0]
#Let's visualize correlations
plt.figure(figsize=(20,10))
cor = selected_features.corr()
heatmap = sns.heatmap(cor
, vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Features Correlation with Task Accomplishment', fontdict={'fontsize':18}, pad=16);
#Correlation with output variable
cor_target = abs(cor['TA'])
#Selecting highly correlated features
relevant_features = cor_target.sort_values(ascending = False)
## Top 6 featurs with high correlation
relevant_features[:6]
TA 1.000000 event_count 0.068686 Page Views 0.043329 page_loads 0.031152 Searches 0.030907 Time Spent per Visit (seconds) 0.028977 Name: TA, dtype: float64
# Oversample with SMOTE and random undersample for imbalanced dataset
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from matplotlib import pyplot
from numpy import where
# define dataset
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)
# summarize class distribution
counter = Counter(y)
print(counter)
# define pipeline
over = SMOTE(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset
X, y = pipeline.fit_resample(X, y)
# summarize the new class distribution
counter = Counter(y)
print(counter)
# scatter plot of examples by class label
for label, _ in counter.items():
row_ix = where(y == label)[0]
pyplot.scatter(X[row_ix, 0], X[row_ix, 1], label=str(label))
pyplot.legend()
pyplot.show()
Counter({0: 9900, 1: 100})
Counter({0: 1980, 1: 990})
rcParams['figure.figsize'] = 15, 5
sns.lineplot(x='Navigation_Difficulties',y='Time Spent per Visit (seconds)',data=df)
<AxesSubplot:xlabel='Navigation_Difficulties', ylabel='Time Spent per Visit (seconds)'>
df.groupby('Navigation_Difficulties')['Time Spent per Visit (seconds)'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Navigation_Difficulties | ||||||||
| No | 461.0 | 503.585683 | 580.211663 | 0.0 | 79.0 | 258.0 | 761.0 | 2192.0 |
| Yes | 541.0 | 486.053604 | 529.292070 | 0.0 | 102.0 | 270.0 | 699.0 | 2236.0 |
rcParams['figure.figsize'] = 15, 5
sns.lineplot(x='TA',y='Time Spent per Visit (seconds)',data=df)
<AxesSubplot:xlabel='TA', ylabel='Time Spent per Visit (seconds)'>
df.groupby('TA')['Time Spent per Visit (seconds)'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| TA | ||||||||
| No | 635.0 | 506.297638 | 549.749230 | 0.0 | 98.5 | 291.0 | 755.0 | 2236.0 |
| Yes | 367.0 | 473.049046 | 558.949197 | 0.0 | 75.0 | 244.0 | 668.5 | 2232.0 |
df.groupby('Navigation_Difficulties')['Average Page Depth'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Navigation_Difficulties | ||||||||
| No | 461.0 | 21.765291 | 13.471723 | 0.0 | 10.5 | 21.5 | 29.5 | 62.0 |
| Yes | 541.0 | 23.412598 | 13.014742 | 0.0 | 15.0 | 22.5 | 31.0 | 61.0 |
df.groupby('TA')['Average Page Depth'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| TA | ||||||||
| No | 635.0 | 22.801183 | 13.417547 | 0.0 | 13.50 | 22.0 | 30.50 | 62.0 |
| Yes | 367.0 | 22.401263 | 12.957487 | 0.0 | 12.75 | 22.5 | 29.75 | 61.0 |
rcParams['figure.figsize'] = 15, 5
sns.lineplot(x='Navigation_Difficulties',y='Average Page Depth',data=df)
<AxesSubplot:xlabel='Navigation_Difficulties', ylabel='Average Page Depth'>
rcParams['figure.figsize'] = 15, 5
sns.lineplot(x='TA',y='Average Page Depth',data=df)
<AxesSubplot:xlabel='TA', ylabel='Average Page Depth'>
rcParams['figure.figsize'] = 15, 5
sns.lineplot(x='Navigation_Difficulties',y='Page Events',data=df)
<AxesSubplot:xlabel='Navigation_Difficulties', ylabel='Page Events'>
df.groupby('Navigation_Difficulties')['Page Events'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Navigation_Difficulties | ||||||||
| No | 461.0 | 16.839479 | 14.746563 | 0.0 | 6.0 | 12.0 | 24.0 | 95.0 |
| Yes | 541.0 | 17.210721 | 15.548966 | 0.0 | 6.0 | 12.0 | 23.0 | 93.0 |
rcParams['figure.figsize'] = 15, 5
sns.lineplot(x='Navigation_Difficulties',y='Page Views',data=df)
<AxesSubplot:xlabel='Navigation_Difficulties', ylabel='Page Views'>
df.groupby('Navigation_Difficulties')['Page Views'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Navigation_Difficulties | ||||||||
| No | 461.0 | 7.255965 | 6.234511 | 1.0 | 2.0 | 5.0 | 10.0 | 35.0 |
| Yes | 541.0 | 7.314233 | 6.166604 | 1.0 | 3.0 | 6.0 | 10.0 | 34.0 |
df.Navigation_Difficulties.unique()
array(['Yes', 'No'], dtype=object)
df.head()
| foresee_respondent | foresee_session_id | wt_visitor_id | day | Survey_Category | Product | Platform | main_task | CSAT | TA | Effort | Navigation_Difficulties | Relevance | called_flag | app_session_id | event_count | page_loads | link_clicks | impressions | urls | Searches | Page Events | Page Views | Average Page Depth | Reloads | Time Spent per Visit (seconds) | Search | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 62 | 4VZ9cYkNlwJ1ZsJR5NQFlA4C | d00c7072-7405-45ef-931d-10a05e5e4cc6 | 7.383279e+09 | 11/16/21 | Account Management | Wireless | Desktop | Other | 0.00 | No | 3 | Yes | 1 | 0 | J6oDR2EdRSrLWKKPdpk1OlyvBbDJPkNxho0yMP_WK28 | 39.0 | 10.0 | 18.0 | 6.0 | /acctmgmt/billandpay, /acctmgmt/billandpay/his... | 1.0 | 30.0 | 10.0 | 16.5 | 1.0 | 425.0 | Used Search |
| 153 | 5omvzyCjcTzmEwXCD5yscNg22J8aDmfZ | 4dbad6a1-a1ed-4911-8474-75e67f78280f | 7.527789e+09 | 1/27/22 | Support | Wireless | Mobile Web | Other | 0.00 | No | 5 | Yes | 1 | 0 | ACu_4RG3Yk7deevwvHNrvhjh8ZR9G7IfnarfrhyeTB8 | 17.0 | 4.0 | 1.0 | 2.0 | /support/category/report-call-or-text, /acctmg... | 0.0 | 41.0 | 14.0 | 7.5 | 1.0 | 1566.0 | Didn't use search |
| 249 | sxI9AUxIUQNEJAJwA8Y1Aw4C | 121692ad-f829-4097-a20e-b18cd3dc5285 | 7.252157e+09 | 11/23/21 | Account Management | Wireless | Desktop | Other | 0.00 | No | 1 | Yes | 2 | 0 | -6qB6ImMdHahNZqyA1MuzvfSyj5oU8x6O-lM0iwUEKI | 10.0 | 2.0 | 4.0 | 1.0 | /support/article/, /wireless/ | 10.0 | 13.0 | 5.0 | 2.4 | 0.0 | 416.0 | Used Search |
| 317 | FQBRAcdA1VJAVYsI5R1oow4C | d5476a67-cc32-45a7-afcf-d56309a0907f | 7.199562e+09 | 11/9/21 | Account Management | Wireless | Desktop | Other | 22.22 | Yes | 5 | Yes | 5 | 0 | 5m4wbPPM-Jk8kR2AwCHWLjXXUmO5i1OjgfPONWwN5vk | 79.0 | 26.0 | 23.0 | 6.0 | /acctmgmt/profile/overview, /my/virtual/update... | 0.0 | 10.0 | 6.0 | 34.5 | 1.0 | 109.0 | Didn't use search |
| 356 | YlPzPIrzy6swxVrgTYOUiOB3QB1qiBoE | 97b9b9ab-34f3-4ca0-a2e8-460d946d7ff2 | 7.496297e+09 | 2/22/22 | Account Management | Wireless | App | Other | 88.89 | No | 4 | No | 9 | 0 | m1oC9ZmkMG2JfXjH3UFEeOmM0qrNiw8J0uKr-Ktrbt8 | 21.0 | 7.0 | 5.0 | 4.0 | /acctmgmt/profile/overview, /acctmgmt/profile/... | 0.0 | 11.0 | 1.0 | 26.0 | 0.0 | 2.0 | Didn't use search |
plt.figure(figsize=(15, 5))
#for e, column in enumerate(['TA', 'Navigation_Difficulties']):
#plt.subplot(3, 2, e + 1)
g = sns.barplot(data=df, x="TA", y="CSAT")
g.set_xticklabels([ 'Use Search', 'Did not use search'],
rotation=20)
plt.ylabel('Frequency')
plt.title("Navigation Success");
sns.countplot(y="Platform", hue="Navigation_Difficulties", data=df)
plt.title("Navigation by Platform")
Text(0.5, 1.0, 'Navigation by Platform')
sns.countplot(y="Platform", hue="TA", data=df)
plt.title("Task Acccomplishment by Platform")
Text(0.5, 1.0, 'Task Acccomplishment by Platform')
rcParams['figure.figsize'] = 18, 6
sns.countplot(y="Product", hue="Navigation_Difficulties", data=df)
plt.title("Navigation Difficulties by Product")
Text(0.5, 1.0, 'Navigation Difficulties by Product')
rcParams['figure.figsize'] = 18, 6
sns.countplot(y="Product", hue="TA", data=df)
plt.title("Task accomplishment by Product")
Text(0.5, 1.0, 'Task accomplishment by Product')
df.columns
Index(['foresee_respondent', 'foresee_session_id', 'wt_visitor_id', 'day',
'Survey_Category', 'Product', 'Platform', 'main_task', 'CSAT', 'TA',
'Effort', 'Navigation_Difficulties', 'Relevance', 'called_flag',
'app_session_id', 'event_count', 'page_loads', 'link_clicks',
'impressions', 'urls', 'Searches', 'Page Events', 'Page Views',
'Average Page Depth', 'Reloads', 'Time Spent per Visit (seconds)',
'Search'],
dtype='object')
## Updating selected features
selected_features_2 = df[['called_flag', 'event_count', 'page_loads','Searches',
'Page Events', 'Page Views','Average Page Depth', 'Reloads',
'Time Spent per Visit (seconds)']].dropna()
### Set survey category as target variable
X = selected_features_2.drop(['Reloads'], axis = 1)
y = selected_features_2['Reloads']
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)
#Split the dataset with 80% data for training and 20% for testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 0)
from sklearn.tree import DecisionTreeClassifier
# define the model
model = DecisionTreeClassifier()
# fit the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_tr = model.predict(X_train)
print("Accuracy score for training data", accuracy_score(y_train, y_pred_tr, normalize=True))
print("Accuracy score for test data", accuracy_score(y_test, y_pred, normalize=True))
# summarize feature importance
imp = pd.DataFrame()
imp["var"] = X.columns
importance = model.feature_importances_
imp["imp"] = importance
imp = imp.sort_values("imp", ascending = False).reset_index(drop = True)
# PRINT OUT THE DATA FRAME
imp.sort_values(by = 'imp', ascending = True).plot.barh(x='var', y='imp', ).figsize=(15, 5)
plt.title("Feature importance plot for Decision Tree Model (Survey Category)")
Accuracy score for training data 1.0 Accuracy score for test data 0.7370753323485968
Text(0.5, 1.0, 'Feature importance plot for Decision Tree Model (Survey Category)')
# Compute confusion matrix
from sklearn.metrics import *
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print (classification_report(y_test, y_pred))
plt.figure(figsize=(9,6))
sns.heatmap(cnf_matrix,annot=True, annot_kws={"size": 16}, fmt="d", cmap=plt.cm.Reds)
precision recall f1-score support
0.0 0.69 0.56 0.62 131
1.0 0.59 0.62 0.61 113
2.0 0.69 0.68 0.68 118
3.0 0.75 0.78 0.76 98
4.0 0.81 0.88 0.84 106
5.0 0.89 0.95 0.92 111
accuracy 0.74 677
macro avg 0.74 0.75 0.74 677
weighted avg 0.73 0.74 0.73 677
<AxesSubplot:>
df.groupby('Survey_Category')['page_loads'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Survey_Category | ||||||||
| Account Management | 699.0 | 11.125894 | 6.499606 | 0.0 | 6.0 | 10.0 | 16.0 | 29.0 |
| Support | 303.0 | 10.990099 | 6.097706 | 1.0 | 6.0 | 10.0 | 15.0 | 29.0 |
df.groupby('Survey_Category')['Time Spent per Visit (seconds)'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Survey_Category | ||||||||
| Account Management | 699.0 | 490.067239 | 557.058690 | 0.0 | 86.0 | 260.0 | 699.0 | 2236.0 |
| Support | 303.0 | 503.468647 | 544.621867 | 0.0 | 89.0 | 275.0 | 789.0 | 2232.0 |
df.groupby('Survey_Category')['Page Events'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Survey_Category | ||||||||
| Account Management | 699.0 | 17.074392 | 15.247158 | 0.0 | 6.0 | 12.0 | 24.0 | 93.0 |
| Support | 303.0 | 16.960396 | 15.044253 | 0.0 | 6.5 | 12.0 | 22.0 | 95.0 |
df.groupby('Survey_Category')['Average Page Depth'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Survey_Category | ||||||||
| Account Management | 699.0 | 23.299994 | 13.186226 | 0.0 | 15.0 | 23.0 | 31.208333 | 61.0 |
| Support | 303.0 | 21.166070 | 13.284747 | 0.0 | 10.5 | 20.0 | 28.166667 | 62.0 |
df.groupby('Survey_Category')['Reloads'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Survey_Category | ||||||||
| Account Management | 699.0 | 0.808298 | 1.151996 | 0.0 | 0.0 | 0.0 | 1.0 | 5.0 |
| Support | 303.0 | 0.745875 | 1.109012 | 0.0 | 0.0 | 0.0 | 1.0 | 5.0 |
## Updating selected features
selected_features_3 = df[['called_flag', 'event_count', 'page_loads','Searches', "Product",
'Page Events', 'Page Views','Average Page Depth', 'Reloads',
'Time Spent per Visit (seconds)']].dropna()
### Set Product as target variable
X = selected_features_3.drop(['Product'], axis = 1)
y = selected_features_3['Product']
#Split the dataset with 80% data for training and 20% for testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 0)
from sklearn.tree import DecisionTreeClassifier
# define the model
model = DecisionTreeClassifier()
# fit the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_tr = model.predict(X_train)
print("Accuracy score for training data", accuracy_score(y_train, y_pred_tr, normalize=True))
print("Accuracy score for test data", accuracy_score(y_test, y_pred, normalize=True))
# summarize feature importance
imp = pd.DataFrame()
imp["var"] = X.columns
importance = model.feature_importances_
imp["imp"] = importance
imp = imp.sort_values("imp", ascending = False).reset_index(drop = True)
# PRINT OUT THE DATA FRAME
imp.sort_values(by = 'imp', ascending = True).plot.barh(x='var', y='imp', ).figsize=(15, 5)
plt.title("Feature importance plot for Decision Tree Model (Product)")
Accuracy score for training data 0.9987515605493134 Accuracy score for test data 0.373134328358209
Text(0.5, 1.0, 'Feature importance plot for Decision Tree Model (Product)')
# Compute confusion matrix
from sklearn.metrics import *
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print (classification_report(y_test, y_pred))
plt.figure(figsize=(9,6))
sns.heatmap(cnf_matrix,annot=True, annot_kws={"size": 16}, fmt="d", cmap=plt.cm.Reds)
precision recall f1-score support
ATT TV 0.00 0.00 0.00 4
BB 0.10 0.16 0.12 32
DTV 0.16 0.21 0.18 14
Other 0.20 0.20 0.20 20
Other services from AT&T (Home phone, DSL, etc.) 0.00 0.00 0.00 11
Other services from AT&T (Home phone, DSL,etc.) 0.00 0.00 0.00 3
WatchTV 0.00 0.00 0.00 0
Wireless 0.66 0.54 0.59 117
accuracy 0.37 201
macro avg 0.14 0.14 0.14 201
weighted avg 0.43 0.37 0.40 201
<AxesSubplot:>
## Updating selected features
selected_features_4 = df[['called_flag', 'event_count', 'page_loads','Searches', "Platform",
'Page Events', 'Page Views','Average Page Depth', 'Reloads',
'Time Spent per Visit (seconds)']].dropna()
### Set Platform as target variable
X = selected_features_4.drop(['Platform'], axis = 1)
y = selected_features_4['Platform']
#Split the dataset with 80% data for training and 20% for testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 0)
from sklearn.tree import DecisionTreeClassifier
# define the model
model = DecisionTreeClassifier()
# fit the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_tr = model.predict(X_train)
print("Accuracy score for training data", accuracy_score(y_train, y_pred_tr, normalize=True))
print("Accuracy score for test data", accuracy_score(y_test, y_pred, normalize=True))
# summarize feature importance
imp = pd.DataFrame()
imp["var"] = X.columns
importance = model.feature_importances_
imp["imp"] = importance
imp = imp.sort_values("imp", ascending = False).reset_index(drop = True)
# PRINT OUT THE DATA FRAME
imp.sort_values(by = 'imp', ascending = True).plot.barh(x='var', y='imp', ).figsize=(15, 5)
plt.title("Feature importance plot for Decision Tree Model (Platform)")
Accuracy score for training data 1.0 Accuracy score for test data 0.572139303482587
Text(0.5, 1.0, 'Feature importance plot for Decision Tree Model (Platform)')
# Compute confusion matrix
from sklearn.metrics import *
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print (classification_report(y_test, y_pred))
plt.figure(figsize=(9,6))
sns.heatmap(cnf_matrix,annot=True, annot_kws={"size": 16}, fmt="d", cmap=plt.cm.Reds)
precision recall f1-score support
App 0.70 0.68 0.69 63
Desktop 0.61 0.60 0.61 98
Mobile Web 0.30 0.33 0.31 40
accuracy 0.57 201
macro avg 0.54 0.54 0.54 201
weighted avg 0.58 0.57 0.57 201
<AxesSubplot:>
df.groupby('Platform')['Time Spent per Visit (seconds)'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Platform | ||||||||
| App | 295.0 | 280.969492 | 353.429682 | 0.0 | 56.5 | 145.0 | 344.00 | 2016.0 |
| Desktop | 510.0 | 615.986275 | 606.431672 | 0.0 | 127.0 | 405.0 | 943.75 | 2221.0 |
| Mobile Web | 197.0 | 497.812183 | 559.341103 | 0.0 | 89.0 | 262.0 | 737.00 | 2236.0 |
df.groupby('Platform')['page_loads'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Platform | ||||||||
| App | 295.0 | 10.728814 | 6.305786 | 2.0 | 5.0 | 10.0 | 15.0 | 29.0 |
| Desktop | 510.0 | 11.274510 | 6.426793 | 1.0 | 6.0 | 11.0 | 15.0 | 29.0 |
| Mobile Web | 197.0 | 11.126904 | 6.368700 | 0.0 | 6.0 | 11.0 | 15.0 | 29.0 |
df.groupby('Platform')['Page Events'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Platform | ||||||||
| App | 295.0 | 17.711864 | 16.727763 | 0.0 | 6.0 | 12.0 | 23.5 | 93.0 |
| Desktop | 510.0 | 17.656863 | 14.697002 | 0.0 | 7.0 | 13.0 | 25.0 | 95.0 |
| Mobile Web | 197.0 | 14.436548 | 13.692776 | 0.0 | 5.0 | 10.0 | 19.0 | 70.0 |
df.groupby('Platform')['Average Page Depth'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Platform | ||||||||
| App | 295.0 | 31.458975 | 12.634395 | 1.0 | 23.75 | 31.0 | 38.233333 | 62.0 |
| Desktop | 510.0 | 18.527578 | 11.643690 | 0.0 | 8.50 | 18.0 | 25.000000 | 57.5 |
| Mobile Web | 197.0 | 20.155089 | 11.715889 | 1.0 | 11.00 | 20.5 | 26.000000 | 61.0 |
df.groupby('Platform')['Reloads'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Platform | ||||||||
| App | 295.0 | 0.837288 | 1.160855 | 0.0 | 0.0 | 0.0 | 1.0 | 5.0 |
| Desktop | 510.0 | 0.784314 | 1.166219 | 0.0 | 0.0 | 0.0 | 1.0 | 5.0 |
| Mobile Web | 197.0 | 0.730964 | 1.031991 | 0.0 | 0.0 | 0.0 | 1.0 | 5.0 |
selected_features_5 = df[['event_count', 'page_loads','Searches', "Navigation_Difficulties",
'Page Events', 'Page Views','Average Page Depth', 'Reloads',
'Time Spent per Visit (seconds)']].dropna()
selected_features_5['Navigation_Difficulties'] = pd.factorize(selected_features_5["Navigation_Difficulties"])[0]
Inference: We can observe from the above plot that Average Page Depth, event_count, Time Spent per Visit, Page Views and Page loads have higher correlation with the Navigation Difficultiles.
#Let's visualize correlations
plt.figure(figsize=(20,10))
cor = selected_features_5.corr()
heatmap = sns.heatmap(cor
, vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Features Correlation with Navigation Difficulties', fontdict={'fontsize':18}, pad=16);
Further Elaboration of Correlation:
The correlation coefficient has values between -1 to 1.
A value closer to 0 implies weaker correlation (exact 0 implying no correlation)
A value closer to 1 implies stronger positive correlation
A value closer to -1 implies stronger negative correlation
#Correlation with output variable
cor_target = abs(cor['Navigation_Difficulties'])
#Selecting highly correlated features
relevant_features = cor_target.sort_values(ascending = False)
## Top 6 featurs with high correlation
relevant_features[:6]
Navigation_Difficulties 1.000000 event_count 0.082212 Average Page Depth 0.062015 Searches 0.037833 Time Spent per Visit (seconds) 0.015807 Page Events 0.012196 Name: Navigation_Difficulties, dtype: float64
### Set survey category as target variable
X = selected_features_5.drop(['Navigation_Difficulties'], axis = 1)
y = selected_features_5['Navigation_Difficulties']
#Split the dataset with 80% data for training and 20% for testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 0)
from sklearn.ensemble import RandomForestClassifier
# define the model
model = RandomForestClassifier()
# fit the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_tr = model.predict(X_train)
print("Accuracy score for training data", accuracy_score(y_train, y_pred_tr, normalize=True))
print("Accuracy score for test data", accuracy_score(y_test, y_pred, normalize=True))
# summarize feature importance
imp = pd.DataFrame()
imp["var"] = X.columns
importance = model.feature_importances_
imp["imp"] = importance
imp = imp.sort_values("imp", ascending = False).reset_index(drop = True)
# PRINT OUT THE DATA FRAME
imp.sort_values(by = 'imp', ascending = True).plot.barh(x='var', y='imp', ).figsize=(15, 5)
plt.title("Feature importance plot for Random Forest Model (Navigation Difficulties)")
Accuracy score for training data 0.9987515605493134 Accuracy score for test data 0.5223880597014925
Text(0.5, 1.0, 'Feature importance plot for Random Forest Model (Navigation Difficulties)')
# Compute confusion matrix
from sklearn.metrics import *
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print (classification_report(y_test, y_pred))
plt.figure(figsize=(9,6))
sns.heatmap(cnf_matrix,annot=True, annot_kws={"size": 16}, fmt="d", cmap=plt.cm.Reds)
precision recall f1-score support
0 0.52 0.66 0.58 101
1 0.53 0.38 0.44 100
accuracy 0.52 201
macro avg 0.52 0.52 0.51 201
weighted avg 0.52 0.52 0.51 201
<AxesSubplot:>
### Set survey category as target variable
X = selected_features_5.drop(['Navigation_Difficulties'], axis = 1)
y = selected_features_5['Navigation_Difficulties']
#Split the dataset with 80% data for training and 20% for testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 0)
from sklearn.neighbors import KNeighborsClassifier
# define the model
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
# summarize feature importance
iris = pd.DataFrame()
X = selected_features_5.drop(['Navigation_Difficulties'], axis = 1)
y = selected_features_5['Navigation_Difficulties']
knn = KNeighborsClassifier(n_neighbors=3)
efs1 = EFS(knn,
min_features=1,
max_features=4,
scoring='accuracy',
print_progress=True,
cv=5)
efs1 = efs1.fit(X, y)
print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)
print('Best subset (corresponding names):', efs1.best_feature_names_)
Features: 162/162
Best accuracy score: 0.57
Best subset (indices): (2, 3, 5, 6)
Best subset (corresponding names): ('Searches', 'Page Events', 'Average Page Depth', 'Reloads')
y_pred_tr = model.predict(X_train)
print("Accuracy score for training data", accuracy_score(y_train, y_pred_tr, normalize=True))
print("Accuracy score for test data", accuracy_score(y_test, y_pred, normalize=True))
Accuracy score for training data 0.9987515605493134 Accuracy score for test data 0.5223880597014925
# Compute confusion matrix
from sklearn.metrics import *
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print (classification_report(y_test, y_pred))
plt.figure(figsize=(9,6))
sns.heatmap(cnf_matrix,annot=True, annot_kws={"size": 16}, fmt="d", cmap=plt.cm.Reds)
precision recall f1-score support
0 0.52 0.66 0.58 101
1 0.53 0.38 0.44 100
accuracy 0.52 201
macro avg 0.52 0.52 0.51 201
weighted avg 0.52 0.52 0.51 201
<AxesSubplot:>
df.groupby('Navigation_Difficulties')['Time Spent per Visit (seconds)'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Navigation_Difficulties | ||||||||
| No | 461.0 | 503.585683 | 580.211663 | 0.0 | 79.0 | 258.0 | 761.0 | 2192.0 |
| Yes | 541.0 | 486.053604 | 529.292070 | 0.0 | 102.0 | 270.0 | 699.0 | 2236.0 |
df.groupby('Navigation_Difficulties')['Average Page Depth'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Navigation_Difficulties | ||||||||
| No | 461.0 | 21.765291 | 13.471723 | 0.0 | 10.5 | 21.5 | 29.5 | 62.0 |
| Yes | 541.0 | 23.412598 | 13.014742 | 0.0 | 15.0 | 22.5 | 31.0 | 61.0 |
Note: Sequential Feature Selector function in sklearn can be used to find significant features for predictive model, we’ll Random Forest Model since we are working on a classification problem where we have to predict the possibility of navigation difficulties.
X = selected_features_5.drop(['Navigation_Difficulties'], axis = 1)
y = selected_features_5['Navigation_Difficulties']
## k_features tells us how many features should be selected.
#We’ve passed 4 so the model will train until 4 features are selected.
from sklearn.ensemble import RandomForestClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
RF = RandomForestClassifier()
fs1 = sfs(RF, k_features=4, forward=True, verbose=2, scoring='balanced_accuracy')
fs1 = fs1.fit(X, y)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 1.9s remaining: 0.0s [Parallel(n_jobs=1)]: Done 8 out of 8 | elapsed: 16.0s finished [2022-08-05 02:00:35] Features: 1/4 -- score: 0.54217240710479[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 2.1s remaining: 0.0s [Parallel(n_jobs=1)]: Done 7 out of 7 | elapsed: 15.3s finished [2022-08-05 02:00:50] Features: 2/4 -- score: 0.5488806610308341[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 2.3s remaining: 0.0s [Parallel(n_jobs=1)]: Done 6 out of 6 | elapsed: 13.7s finished [2022-08-05 02:01:04] Features: 3/4 -- score: 0.5437098823413916[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 2.7s remaining: 0.0s [Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 12.7s finished [2022-08-05 02:01:17] Features: 4/4 -- score: 0.5576166780803475
feat_names = list(fs1.k_feature_names_)
print(feat_names)
['Searches', 'Page Events', 'Average Page Depth', 'Time Spent per Visit (seconds)']
### Set survey category as target variable
X = selected_features_5.drop(['Navigation_Difficulties'], axis = 1)
y = selected_features_5['Navigation_Difficulties']
#Split the dataset with 80% data for training and 20% for testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 0)
from sklearn.neighbors import KNeighborsClassifier
# define the model
from sklearn.datasets import load_iris
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
# summarize feature importance
iris = pd.DataFrame()
X = selected_features_5.drop(['Navigation_Difficulties'], axis = 1)
y = selected_features_5['Navigation_Difficulties']
knn = KNeighborsClassifier(n_neighbors=3)
efs1 = EFS(knn,
min_features=1,
max_features=4,
scoring='accuracy',
print_progress=True,
cv=5)
efs1 = efs1.fit(X, y)
print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)
print('Best subset (corresponding names):', efs1.best_feature_names_)
Features: 162/162
Best accuracy score: 0.57
Best subset (indices): (2, 3, 5, 6)
Best subset (corresponding names): ('Searches', 'Page Events', 'Average Page Depth', 'Reloads')
### Set survey category as target variable
X = selected_features.drop('TA', axis = 1)
y = selected_features['TA']
#Split the dataset with 80% data for training and 20% for testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 0)
from sklearn.neighbors import KNeighborsClassifier
# define the model
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
# summarize feature importance
iris = pd.DataFrame()
X = selected_features.drop('TA', axis = 1)
y = selected_features['TA']
knn = KNeighborsClassifier(n_neighbors=3)
efs1 = EFS(knn,
min_features=1,
max_features=4,
scoring='accuracy',
print_progress=True,
cv=5)
efs1 = efs1.fit(X, y)
print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)
print('Best subset (corresponding names):', efs1.best_feature_names_)
Features: 255/255
Best accuracy score: 0.63
Best subset (indices): (0,)
Best subset (corresponding names): ('called_flag',)
import matplotlib.pyplot as plt
metric_dict = efs1.get_metric_dict()
fig = plt.figure()
k_feat = sorted(metric_dict.keys())
avg = [metric_dict[k]['avg_score'] for k in k_feat]
upper, lower = [], []
for k in k_feat:
upper.append(metric_dict[k]['avg_score'] +
metric_dict[k]['std_dev'])
lower.append(metric_dict[k]['avg_score'] -
metric_dict[k]['std_dev'])
plt.fill_between(k_feat,
upper,
lower,
alpha=0.2,
color='blue',
lw=1)
plt.plot(k_feat, avg, color='blue', marker='o')
plt.ylabel('Accuracy +/- Standard Deviation')
plt.xlabel('Number of Features')
feature_min = len(metric_dict[k_feat[0]]['feature_idx'])
feature_max = len(metric_dict[k_feat[-1]]['feature_idx'])
plt.xticks(k_feat,
[str(metric_dict[k]['feature_names']) for k in k_feat],
rotation=90)
plt.show()
df5 = pd.read_csv("error_codes.csv")
df5.head()
| APP_SESSION_ID | FORESEE_SESSION | ERROR_CODES | |
|---|---|---|---|
| 0 | c222a3fb-c62d-4710-ad66-49b2580ba9d1 | 9d314b30-9abf-4e77-9b3e-1257b823b88d | REG_BE_200,REG_BV_002 |
| 1 | g0m8IoKwE8gPtVA7VnuIHdVYBifI6crJCST3e4iI1Dc | 24e04221-cf77-432e-84b6-b62311051f68 | PFS10006,201,MS_ORCH_FID_LE_BE_01_OK |
| 2 | _z0RK7lnruN9USCJuu7J3WJ0zCGCM4D9g0LKCu6peU4 | e98ffd19-2b5d-434f-9dc6-9d06d4374328 | PMS10002,PMS10010,-2 |
| 3 | ZLKJSx4fSoFlDMcMzfsChpIPVM18GZmAugzmNiHV1l0 | 3362eee1-62cd-4506-803c-3e22a43a495f | -2,MSPMT_E200101,1 |
| 4 | 917nZt30odWeiH5XORj_aWE9LWn-LHlyhmIwmBVRS6k | 5e73673a-7df1-48de-a261-209404f9c499 | 1,LGN_02_07 |
df6 = pd.read_csv("searches.csv")
df6.head()
| foresee_session | search_terms | |
|---|---|---|
| 0 | 0048f7c6-870d-4929-9585-c15d817bd4a3 | where can i find the password for my router |
| 1 | 007e2653-07cc-43b4-af52-b1fbd695c7df | i do not have auto pay and some one debited ... |
| 2 | 0089b782-3aad-4294-aef4-bda8957615c5 | recycle equipment |
| 3 | 0094f7bf-0c6b-4e9c-96bc-4f877c36d662 | internet security suite download, internet ... |
| 4 | 00ad6f19-4e16-4515-8bb7-f3d3fcfafb80 | att sent me a text saying that you are givin... |
merged = pd.merge(left=df5, right=df6, left_on='FORESEE_SESSION', right_on='foresee_session')
df = pd.merge(left=merged, right=df, left_on='FORESEE_SESSION', right_on='foresee_session_id')
df
| APP_SESSION_ID | FORESEE_SESSION | ERROR_CODES | foresee_session | search_terms | foresee_respondent | foresee_session_id | wt_visitor_id | day | Survey_Category | Product | Platform | main_task | CSAT | TA | Effort | Navigation_Difficulties | Relevance | called_flag | app_session_id | event_count | page_loads | link_clicks | impressions | urls | Searches | Page Events | Page Views | Average Page Depth | Reloads | Time Spent per Visit (seconds) | Search | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | jL9BaVnPvstmKC1wZVmcS8AgY00ewZlWiPXHKLElTa4 | 1906c7ac-8998-47d3-b2e3-e3dec83a6965 | 0,-2 | 1906c7ac-8998-47d3-b2e3-e3dec83a6965 | appointment | ktSsvYnsVJzZkAq3KuvfDhRfgrle9N0e | 1906c7ac-8998-47d3-b2e3-e3dec83a6965 | 7.722781e+09 | 1/30/22 | Support | BB | Mobile Web | Other | 0.00 | No | 1 | Yes | 1 | 0 | jL9BaVnPvstmKC1wZVmcS8AgY00ewZlWiPXHKLElTa4 | 36.0 | 11.0 | 5.0 | 4.0 | /acctmgmt/accountoverview, /search, /support/a... | 0.0 | 2.0 | 1.0 | 15.000000 | 0.0 | 830.0 | Didn't use search |
| 1 | HzlpveI8jY02EbaRBW7gK6Z29KvwdC2PnxgTeeBka6E | bb8e44e7-fee1-42ca-b629-b272364e1cd5 | 0,1 | bb8e44e7-fee1-42ca-b629-b272364e1cd5 | signature program enrollment, fan 5339885 | O5b6UHUsmuefVSGXNYZniWfYP6HB02yu | bb8e44e7-fee1-42ca-b629-b272364e1cd5 | 7.366163e+09 | 1/7/22 | Account Management | Wireless | App | Other | 40.74 | No | 2 | Yes | 2 | 0 | HzlpveI8jY02EbaRBW7gK6Z29KvwdC2PnxgTeeBka6E | 30.0 | 9.0 | 3.0 | 5.0 | /my/virtual/accountoverview, /verification/sig... | 0.0 | 65.0 | 21.0 | 38.000000 | 0.0 | 428.0 | Didn't use search |
| 2 | zPHphjHODqTvcVY6ZCr2bLRH3CrgSNX1y8Ud92QzSkc | 5a8f4cdc-bd9a-41db-aaea-35e5256b2885 | -2 | 5a8f4cdc-bd9a-41db-aaea-35e5256b2885 | discount | s4QUwN5khglNo9cExAcZxQ4C | 5a8f4cdc-bd9a-41db-aaea-35e5256b2885 | 6.602690e+09 | 1/25/22 | Account Management | Wireless | Desktop | Other | 0.00 | No | 1 | Yes | 1 | 0 | zPHphjHODqTvcVY6ZCr2bLRH3CrgSNX1y8Ud92QzSkc | 45.0 | 12.0 | 12.0 | 4.0 | /acctmgmt/tobr/transfersummary, /verification/... | 0.0 | 10.0 | 3.0 | 18.000000 | 0.0 | 177.0 | Didn't use search |
| 3 | o398yfrcnWmqcfbyWFzQlt-vndv1pKXMBCqweEcAEBo | 6c085f25-90c4-45a3-ae83-61bb8276e5df | 205.4,205.2,MSFIDACL08,MSFIDELAG00 | 6c085f25-90c4-45a3-ae83-61bb8276e5df | email login | ZgpsJJ4ARx98goU0VIV1Rw4C | 6c085f25-90c4-45a3-ae83-61bb8276e5df | 6.865271e+09 | 1/22/22 | Support | BB | Desktop | Other | 0.00 | No | 1 | Yes | 1 | 0 | o398yfrcnWmqcfbyWFzQlt-vndv1pKXMBCqweEcAEBo | 32.0 | 5.0 | 2.0 | 2.0 | /search, , /acctmgmt/fid/emailaccount, /acctmg... | 0.0 | 15.0 | 13.0 | 55.692308 | 1.0 | 1605.0 | Didn't use search |
| 4 | fM-dl2aLtzdeJGhecUsSXwb1VZBrbLx07qsVLZ0fyL8 | 07d09594-21a1-4490-9f16-6694053a1eb6 | 0 | 07d09594-21a1-4490-9f16-6694053a1eb6 | how do i print my statement | 3594e2gPZLuuTOY3dJ3hYI78VfIP81Y8 | 07d09594-21a1-4490-9f16-6694053a1eb6 | 6.445448e+09 | 1/18/22 | Account Management | Wireless | App | Other | 0.00 | No | 1 | No | 1 | 0 | fM-dl2aLtzdeJGhecUsSXwb1VZBrbLx07qsVLZ0fyL8 | 21.0 | 5.0 | 3.0 | 6.0 | /search, , /acctmgmt/billandpay, /support/topi... | 0.0 | 82.0 | 18.0 | 27.500000 | 1.0 | 762.0 | Didn't use search |
| 5 | E-6xuKM1nbXOp_fMldltQKEXUGzrVvLS1mUB929yfhk | ed04f7f0-ff55-4493-ba21-199c094143d6 | -2 | ed04f7f0-ff55-4493-ba21-199c094143d6 | connectivity program, connectivity program | BNpSJFtw6ehmqwzBV4QEMOv4asd1mYC6 | ed04f7f0-ff55-4493-ba21-199c094143d6 | 7.783611e+09 | 2/18/22 | Support | Wireless | Mobile Web | Other | 22.22 | Yes | 1 | Yes | 3 | 0 | E-6xuKM1nbXOp_fMldltQKEXUGzrVvLS1mUB929yfhk | 37.0 | 11.0 | 10.0 | 9.0 | /acctmgmt/accountoverview, /help/affordable-co... | 0.0 | 6.0 | 3.0 | 26.000000 | 0.0 | 30.0 | Didn't use search |
| 6 | UP19i0Ypn-zxk2HsOPDfhzS4gGjS2jY-dFX7iid7a5E | b0bf9b86-faca-4609-bdf6-edf9659f9875 | 201,-2 | b0bf9b86-faca-4609-bdf6-edf9659f9875 | reward card | EhYRJ158VYYpRI00EVY8RA4C | b0bf9b86-faca-4609-bdf6-edf9659f9875 | 7.698561e+09 | 1/16/22 | Account Management | BB | Desktop | Other | 88.89 | Yes | 10 | No | 10 | 0 | UP19i0Ypn-zxk2HsOPDfhzS4gGjS2jY-dFX7iid7a5E | 16.0 | 5.0 | 3.0 | 2.0 | /search, , /myfavorites, /my/virtual/verifyema... | 0.0 | 24.0 | 14.0 | 7.500000 | 1.0 | 1142.0 | Didn't use search |
| 7 | 3VrbsgEz4boyORWbh9b5ReUKOnXJa7DOKPZWpAKuWzs | ccd008d5-ad91-481d-af17-453072004f86 | 0,LGN_02_07,-2 | ccd008d5-ad91-481d-af17-453072004f86 | feedback, feedback, complaints, feedback,... | NbXigqh82nmJX2KVgPKxZ76G5dNKGr1g | ccd008d5-ad91-481d-af17-453072004f86 | 7.702964e+09 | 1/18/22 | Account Management | Wireless | Mobile Web | Other | 0.00 | No | 1 | No | 1 | 0 | 3VrbsgEz4boyORWbh9b5ReUKOnXJa7DOKPZWpAKuWzs | 90.0 | 26.0 | 17.0 | 10.0 | /acctmgmt/fpwd/selectmethod, /acctmgmt/fpwd/re... | 4.0 | 21.0 | 8.0 | 45.500000 | 2.0 | 149.0 | Used Search |
| 8 | I-tIoLOPqfuCmAc5qCFv6-tMcvOAWT-xpSRVQdXgduY | 02803df1-5eb1-4608-bd94-58dc7bc28128 | -2,1,201 | 02803df1-5eb1-4608-bd94-58dc7bc28128 | unlock phone | zVV7pXpwnNRh0kLY4FBMu1tBg5aPZQ9Z | 02803df1-5eb1-4608-bd94-58dc7bc28128 | 7.704693e+09 | 1/21/22 | Account Management | Wireless | Mobile Web | Other | 77.78 | Yes | 5 | No | 8 | 0 | I-tIoLOPqfuCmAc5qCFv6-tMcvOAWT-xpSRVQdXgduY | 65.0 | 20.0 | 14.0 | 9.0 | /deviceunlock/unlockstep2, /deviceunlock/unloc... | 0.0 | 3.0 | 4.0 | 33.500000 | 0.0 | 155.0 | Didn't use search |
| 9 | cMAfnuFfxIjFfTHj1En3Na2JdyeV2v3jeogTufcz6FI | f07cb7de-484e-4d87-8568-bd51d9d78653 | 0,-2 | f07cb7de-484e-4d87-8568-bd51d9d78653 | i wanna change my wi-fi password, i wanna c... | 7gB8G8bxzPnjW93tCZE8seksISZDvrdA | f07cb7de-484e-4d87-8568-bd51d9d78653 | 7.751387e+09 | 2/3/22 | Support | BB | Mobile Web | Other | 0.00 | No | 1 | Yes | 1 | 0 | cMAfnuFfxIjFfTHj1En3Na2JdyeV2v3jeogTufcz6FI | 73.0 | 18.0 | 10.0 | 14.0 | /buy/phones, /, /buy/cart, /support/contact-us... | 0.0 | 70.0 | 19.0 | 22.000000 | 2.0 | 980.0 | Didn't use search |
| 10 | _T4kw6QzgvNiBxFMBC7z8tsQ79YgDdmfUtfRVrpTnuc | 791dea66-a9b2-438a-a290-2a05569156bc | 1 | 791dea66-a9b2-438a-a290-2a05569156bc | contact information, contact information, ... | Q9IptUUENQc8sNEt8xwIYw4C | 791dea66-a9b2-438a-a290-2a05569156bc | 7.625703e+09 | 1/17/22 | Support | Wireless | Desktop | Other | 0.00 | No | 5 | Yes | 1 | 0 | _T4kw6QzgvNiBxFMBC7z8tsQ79YgDdmfUtfRVrpTnuc | 71.0 | 13.0 | 16.0 | 12.0 | /acctmgmt/accountoverview, /support/topic/my-a... | 3.0 | 19.0 | 6.0 | 17.500000 | 2.0 | 946.0 | Used Search |
| 11 | 83b804ad-6e79-4e07-ace6-a25e44b92fe9 | 80877da0-92a3-488d-86bf-849ed5c32ac4 | -2,1 | 80877da0-92a3-488d-86bf-849ed5c32ac4 | unlimi | SWfVQxtl8cLm9ZfSsWgyPJO2bdboYjTZ | 80877da0-92a3-488d-86bf-849ed5c32ac4 | 7.795247e+09 | 2/20/22 | Support | Wireless | Mobile Web | Other | 96.30 | Yes | 5 | No | 9 | 0 | 83b804ad-6e79-4e07-ace6-a25e44b92fe9 | 74.0 | 26.0 | 14.0 | 14.0 | /support/article/, /acctmgmt/wireless/device/d... | 0.0 | 23.0 | 11.0 | 26.000000 | 2.0 | 353.0 | Didn't use search |
| 12 | _tPuRpH78pcP09ZP26I-2SkvU3HqkV_VsPqzQyRlh8M | d1773bca-bc5d-4491-a4e9-79ba1d4828f9 | -2,1 | d1773bca-bc5d-4491-a4e9-79ba1d4828f9 | employee and retiree discount | 4ohx98RxoAwQoNxo8dV95A4C | d1773bca-bc5d-4491-a4e9-79ba1d4828f9 | 5.509732e+09 | 2/12/22 | Account Management | Wireless | Desktop | Other | 62.96 | Yes | 6 | Yes | 7 | 0 | _tPuRpH78pcP09ZP26I-2SkvU3HqkV_VsPqzQyRlh8M | 56.0 | 12.0 | 5.0 | 9.0 | /acctmgmt/accountoverview, /acctmgmt/registrat... | 0.0 | 16.0 | 6.0 | 6.666667 | 2.0 | 0.0 | Didn't use search |
| 13 | 2333560d-298e-4818-b7ac-9236cb833126 | 11e5f7e0-d9a6-4aff-9f63-e2ae7bdab222 | 0,-2 | 11e5f7e0-d9a6-4aff-9f63-e2ae7bdab222 | receive free at&t cell booster because of up... | cEyH7o69Xvu49ImoxoC2bzSDoNi5OGt7 | 11e5f7e0-d9a6-4aff-9f63-e2ae7bdab222 | 7.784615e+09 | 2/14/22 | Support | Wireless | Mobile Web | Other | 0.00 | No | 2 | Yes | 1 | 0 | 2333560d-298e-4818-b7ac-9236cb833126 | 73.0 | 10.0 | 1.0 | 12.0 | /support/article/, /support/, /search, /Accept... | 0.0 | 30.0 | 10.0 | 28.500000 | 1.0 | 392.0 | Didn't use search |
| 14 | 2761eb41-90e2-45b7-b2a6-a6ab5f5b9973 | ab023c8d-26f5-4a4a-a7fb-0888ed06d5c1 | 205.4 | ab023c8d-26f5-4a4a-a7fb-0888ed06d5c1 | anti virus | lpQ599xZ4sEckYwksMstMA4C | ab023c8d-26f5-4a4a-a7fb-0888ed06d5c1 | 7.759407e+09 | 2/8/22 | Support | BB | Desktop | Other | 0.00 | No | 1 | Yes | 1 | 0 | 2761eb41-90e2-45b7-b2a6-a6ab5f5b9973 | 37.0 | 9.0 | 6.0 | 14.0 | /acctmgmt/registration/serviceselection, /supp... | 0.0 | 20.0 | 9.0 | 34.555556 | 1.0 | 128.0 | Didn't use search |
| 15 | cv_AW5zy7vDpK0N-WbGD6mYWe8yeJ-fVCHiHfpd9wjU | f4318982-e731-48a3-8b79-18bdde01431c | 0 | f4318982-e731-48a3-8b79-18bdde01431c | report text spam | pEcrUJKOe3JDUyrjcrDH4U0VPnn3XI1g | f4318982-e731-48a3-8b79-18bdde01431c | 7.177417e+09 | 1/11/22 | Support | Wireless | App | Other | 100.00 | Yes | 9 | No | 10 | 0 | cv_AW5zy7vDpK0N-WbGD6mYWe8yeJ-fVCHiHfpd9wjU | 27.0 | 9.0 | 3.0 | 3.0 | /support/article/, , /myatt/index-native.html,... | 0.0 | 5.0 | 2.0 | 38.500000 | 0.0 | 123.0 | Didn't use search |
| 16 | dffd3255-a5cc-4836-b41c-9e0bbca4b9a7 | d8083682-4e02-4339-ab63-6f2afb989e98 | 1,-2 | d8083682-4e02-4339-ab63-6f2afb989e98 | how to get passcode for smart home manager | Z3AMLQdh9WFv9wq2W3kjvSHLkVRxX8Ot | d8083682-4e02-4339-ab63-6f2afb989e98 | 7.685185e+09 | 1/9/22 | Support | BB | Mobile Web | Other | 100.00 | No | 10 | Yes | 5 | 0 | dffd3255-a5cc-4836-b41c-9e0bbca4b9a7 | 47.0 | 7.0 | 9.0 | 8.0 | /internet/smart-home/, /acctmgmt/accountovervi... | 0.0 | 14.0 | 7.0 | 14.000000 | 0.0 | 1232.0 | Didn't use search |
| 17 | UBGgxIHqLQ7UsBS4Png4SBSQE43dgj5FL3lli37DiC4 | 5b2d10c4-4176-4a11-a9cb-e307e4b3b764 | 0,201,1 | 5b2d10c4-4176-4a11-a9cb-e307e4b3b764 | transfer number to another carrier, unlock | lF9xt8IYF0MgwAsYB05ddw4C | 5b2d10c4-4176-4a11-a9cb-e307e4b3b764 | 7.671234e+09 | 1/1/22 | Account Management | Wireless | Desktop | Other | 74.07 | Yes | 4 | No | 6 | 0 | UBGgxIHqLQ7UsBS4Png4SBSQE43dgj5FL3lli37DiC4 | 36.0 | 13.0 | 3.0 | 5.0 | /deviceunlock/thankyou, /deviceunlock/unlockst... | 0.0 | 23.0 | 13.0 | 11.000000 | 1.0 | 608.0 | Didn't use search |
| 18 | PMs-aWqwFXHOKlLXV19HesSJv4o7LmpMnlAet4Z5dWA | 5e9589fa-2a08-49f6-a34e-cef2de2e0f5e | 902,201,-2,1 | 5e9589fa-2a08-49f6-a34e-cef2de2e0f5e | microcell | osg5cAVEZlY4IJNRg9lNog4C | 5e9589fa-2a08-49f6-a34e-cef2de2e0f5e | 7.777070e+09 | 2/14/22 | Support | Wireless | Desktop | Other | 0.00 | No | 10 | Yes | 1 | 0 | PMs-aWqwFXHOKlLXV19HesSJv4o7LmpMnlAet4Z5dWA | 64.0 | 11.0 | 13.0 | 12.0 | /search, /acctmgmt/accountoverview, /support/,... | 0.0 | 13.0 | 3.0 | 28.000000 | 2.0 | 588.0 | Didn't use search |
| 19 | QjYiFC3jKCcXS6ZoDSaCUiQQq9yrGckXxm-0jsGyPi8 | 921b5dc2-fc76-4752-a3ee-d9c42f4bb7cd | 902,205.4,-2 | 921b5dc2-fc76-4752-a3ee-d9c42f4bb7cd | security suite download | dkJIAIYYkwBcMUQpE9IAgw4C | 921b5dc2-fc76-4752-a3ee-d9c42f4bb7cd | 7.699021e+09 | 1/16/22 | Support | BB | Desktop | Other | 0.00 | No | 1 | Yes | 1 | 0 | QjYiFC3jKCcXS6ZoDSaCUiQQq9yrGckXxm-0jsGyPi8 | 41.0 | 12.0 | 7.0 | 8.0 | /search, , /support/article/, /, /internet/fib... | 1.0 | 10.0 | 2.0 | 43.000000 | 1.0 | 668.0 | Used Search |
| 20 | 00rGvGm8YUxMOpP5otOR3J_9SarR7h7oCmLzvqbBFwA | 6ea46be0-9109-4714-a3e0-8e585482e943 | 0,1,201,-2 | 6ea46be0-9109-4714-a3e0-8e585482e943 | suspend | 9hwc1pENNNAAYsgc40QQRw4C | 6ea46be0-9109-4714-a3e0-8e585482e943 | 6.916094e+09 | 1/31/22 | Support | Wireless | Desktop | Other | 85.19 | Yes | 8 | No | 7 | 0 | 00rGvGm8YUxMOpP5otOR3J_9SarR7h7oCmLzvqbBFwA | 68.0 | 17.0 | 14.0 | 8.0 | , /support/smallbusiness/article/change-device... | 0.0 | 13.0 | 5.0 | 18.000000 | 0.0 | 993.0 | Didn't use search |
| 21 | 1l1kNzw_sEzrMJXBzx7bXLke44PI_Ohd0HUDMlg4LxA | 8be56008-2d38-4a37-884b-54310af79033 | 1,-2 | 8be56008-2d38-4a37-884b-54310af79033 | international | MNodhYcNAtZ8coUd1BgMtQ4C | 8be56008-2d38-4a37-884b-54310af79033 | 7.671885e+09 | 1/1/22 | Account Management | Wireless | Desktop | Other | 77.78 | Yes | 8 | No | 7 | 0 | 1l1kNzw_sEzrMJXBzx7bXLke44PI_Ohd0HUDMlg4LxA | 36.0 | 12.0 | 11.0 | 4.0 | /international/day-pass/, /acctmgmt/wireless/d... | 4.0 | 26.0 | 15.0 | 8.000000 | 1.0 | 1414.0 | Used Search |
| 22 | n3G5vbxNCES_TJGB4k-UCaxJ6jTO-lJ6hDtUr3yaW_k | 0e98288d-227b-4722-ab70-faa2b2766c7d | -2 | 0e98288d-227b-4722-ab70-faa2b2766c7d | chat | 2LjGjDtB4ZJp1YnOsdBlPrA73U0H01Gv | 0e98288d-227b-4722-ab70-faa2b2766c7d | 7.786353e+09 | 2/16/22 | Account Management | Wireless | Mobile Web | Other | 44.44 | No | 5 | No | 3 | 0 | n3G5vbxNCES_TJGB4k-UCaxJ6jTO-lJ6hDtUr3yaW_k | 38.0 | 10.0 | 5.0 | 7.0 | /acctmgmt/makepayment, /acctmgmt/billandpay/hi... | 0.0 | 11.0 | 5.0 | 17.000000 | 2.0 | 875.0 | Didn't use search |
| 23 | -tgPaLs2YjGV23uoZ4jw2_SEQjstr4PLF8r65A7K_G4 | 2aae6e7d-77ee-4be9-ab6f-1ef7aea637e3 | 902,0 | 2aae6e7d-77ee-4be9-ab6f-1ef7aea637e3 | mcell replacement, mcell replacement, mcel... | s9IBwMwxEAI9Btpg0F8N1g4C | 2aae6e7d-77ee-4be9-ab6f-1ef7aea637e3 | 7.754894e+09 | 2/5/22 | Support | Wireless | Desktop | Other | 37.04 | No | 8 | Yes | 6 | 0 | #NAME? | 51.0 | 11.0 | 7.0 | 11.0 | /checkmyorder/omhub.rt, /haloc/virtual/login/e... | 0.0 | 6.0 | 5.0 | 9.800000 | 0.0 | 939.0 | Didn't use search |
| 24 | pMuJTC87rmF-ss0YhU4K00JIa6VPYmdsD7F5sW1UyS4 | f720b92f-124a-4bcd-815b-7be035e2df55 | -2,1 | f720b92f-124a-4bcd-815b-7be035e2df55 | att fiber | YgcZQlxZRxgh4JIkl4wZEw4C | f720b92f-124a-4bcd-815b-7be035e2df55 | 7.531885e+09 | 1/26/22 | Support | BB | Desktop | Other | 29.63 | No | 5 | Yes | 5 | 0 | pMuJTC87rmF-ss0YhU4K00JIa6VPYmdsD7F5sW1UyS4 | 59.0 | 17.0 | 10.0 | 15.0 | /search, /buy/broadband/existing-service/modal... | 0.0 | 49.0 | 34.0 | 52.029412 | 4.0 | 1731.0 | Didn't use search |
| 25 | 4f73c455-d669-47ea-abb4-497426b5f7cb | 59e1e72c-f113-4db4-8fff-96b9a622d616 | -2,0,ERR0003 | 59e1e72c-f113-4db4-8fff-96b9a622d616 | track technician | Oq8vp5QUsrShT0DQKizjK0kE0suyGqLh | 59e1e72c-f113-4db4-8fff-96b9a622d616 | 7.798002e+09 | 2/21/22 | Support | BB | Mobile Web | Other | 51.85 | No | 3 | Yes | 4 | 0 | 4f73c455-d669-47ea-abb4-497426b5f7cb | 45.0 | 12.0 | 3.0 | 10.0 | /tsrhs/virtual/smartphone/preflow/welcometotro... | 0.0 | 11.0 | 2.0 | 19.500000 | 0.0 | 115.0 | Didn't use search |
| 26 | 2e033f96-c8b2-49be-afbf-c7fb386ad0f6 | b50b7e65-ff77-4a0e-973f-c2746eda04d9 | 1,-2 | b50b7e65-ff77-4a0e-973f-c2746eda04d9 | voice mail | No4cw11JV5hZYBd9hs48Ig4C | b50b7e65-ff77-4a0e-973f-c2746eda04d9 | 7.780587e+09 | 2/11/22 | Account Management | Wireless | Desktop | Other | 88.89 | Yes | 7 | No | 9 | 0 | 2e033f96-c8b2-49be-afbf-c7fb386ad0f6 | 68.0 | 15.0 | 20.0 | 13.0 | /support/article/, , /my/virtual/contactinfo, ... | 0.0 | 7.0 | 3.0 | 18.000000 | 0.0 | 696.0 | Didn't use search |
| 27 | -p4MuMP4ln6lfeiGrfBRWjvT1LxItz0oBLTFHasQUJk | 42c605eb-bbec-4778-a46e-f37afff76c36 | -2,201 | 42c605eb-bbec-4778-a46e-f37afff76c36 | technical support | SVgn6udhpqgBxAMNC3utqM2xYenTXdN7 | 42c605eb-bbec-4778-a46e-f37afff76c36 | 7.680895e+09 | 1/12/22 | Support | Wireless | Mobile Web | Other | 0.00 | No | 1 | No | 1 | 0 | #NAME? | 60.0 | 9.0 | 9.0 | 7.0 | /search, , /support/ts/wireless/phonecalls, /d... | 0.0 | 6.0 | 1.0 | 11.000000 | 0.0 | 15.0 | Didn't use search |
| 28 | J4rcrzGtv9dL4iLsuRpou0xg3PYJcmsHiDf3OAsSL1c | f3d5447f-9b91-4369-ae3d-6a2e21b8dcd6 | 0 | f3d5447f-9b91-4369-ae3d-6a2e21b8dcd6 | why does my bill keep getting higher | mVQUU3Q4TyWGF4cMKINI218SAXwxLpio | f3d5447f-9b91-4369-ae3d-6a2e21b8dcd6 | 5.611405e+09 | 2/10/22 | Account Management | Wireless | App | Other | 40.74 | No | 4 | Yes | 4 | 0 | J4rcrzGtv9dL4iLsuRpou0xg3PYJcmsHiDf3OAsSL1c | 108.0 | 22.0 | 16.0 | 11.0 | /search, /acctmgmt/error, /, /support/topic/my... | 0.0 | 57.0 | 2.0 | 27.500000 | 0.0 | 283.0 | Didn't use search |
| 29 | pWO7zeo9GvOy0tLulWgXAE8KhTykpk3GJKsfGVP6Je8 | c347b144-4632-462a-b1f0-dafd5c5469e9 | 1,-2 | c347b144-4632-462a-b1f0-dafd5c5469e9 | transfer phone pin, transfer to different c... | 1NJNIhpkIl5sQQB9QUk04g4C | c347b144-4632-462a-b1f0-dafd5c5469e9 | 7.716904e+09 | 1/26/22 | Account Management | Wireless | Desktop | Other | 66.67 | No | 2 | Yes | 5 | 0 | pWO7zeo9GvOy0tLulWgXAE8KhTykpk3GJKsfGVP6Je8 | 75.0 | 20.0 | 17.0 | 13.0 | /acctmgmt/registration/confirmation, /acctmgmt... | 0.0 | 29.0 | 8.0 | 21.500000 | 0.0 | 397.0 | Didn't use search |
| 30 | GdzmzDC5wPCNAR7wpyXb3-V7gdjbB2ZmmfaK-lneDRQ | cd52db5e-a581-4950-9e84-8ddcd1353549 | 1,201,0,-2 | cd52db5e-a581-4950-9e84-8ddcd1353549 | why isn't my mobile phone connected to internet | 8cVIJ01Zc1khw5dxg8x8RQ4C | cd52db5e-a581-4950-9e84-8ddcd1353549 | 7.505438e+09 | 1/11/22 | Support | BB | Desktop | Other | 33.33 | No | 1 | Yes | 1 | 0 | GdzmzDC5wPCNAR7wpyXb3-V7gdjbB2ZmmfaK-lneDRQ | 73.0 | 15.0 | 23.0 | 15.0 | /support/, /search, /support/topic/u-verse-hig... | 0.0 | 2.0 | 1.0 | 23.000000 | 0.0 | 1466.0 | Didn't use search |
| 31 | SvNDLOpvseJJbQECWCyVtR-6CVh8UO48xOkKSEoUT6s | 0582e189-b3c4-4fc0-b8bf-25b904d24444 | -2,1 | 0582e189-b3c4-4fc0-b8bf-25b904d24444 | direct tv discount | wtMosdoog19sVAQlNUAZJQ4C | 0582e189-b3c4-4fc0-b8bf-25b904d24444 | 7.684402e+09 | 1/12/22 | Account Management | Wireless | Desktop | Other | 85.19 | Yes | 8 | No | 7 | 0 | SvNDLOpvseJJbQECWCyVtR-6CVh8UO48xOkKSEoUT6s | 68.0 | 20.0 | 9.0 | 10.0 | , /my/virtual/attcomm, /bundles/, /buy/postpai... | 0.0 | 3.0 | 1.0 | 23.000000 | 0.0 | 68.0 | Didn't use search |
df.to_csv('Merged_file.csv', index = False)
def clean_text(text):
#will replace the html characters with " "
text=re.sub('<.*?>', ' ', text)
text = re.sub('[^a-zA-Z]',' ',text)
#will replace newline with space
text = re.sub("\n"," ",text)
#will convert to lower case
text = text.lower()
# will split and join the words
text=' '.join(text.split())
return text
### Apply data cleaning on text feature in our data
df6['search_terms'] = df6['search_terms'].apply(lambda x : clean_text(x))
df6['search_terms']
0 where can i find the password for my router
1 i do not have auto pay and some one debited my...
2 recycle equipment
3 internet security suite download internet secu...
4 att sent me a text saying that you are giving ...
...
8371 reset voicemail password set up my cellphone v...
8372 need to pay bill
8373 talk to a rep
8374 is there fiber at my house is there fiber at m...
8375 my phone bill
Name: search_terms, Length: 8376, dtype: object
## Search Terms that led to Navigation Difficulties
navigation_success = df.loc[df['Navigation_Difficulties'] == 'Yes']
from wordcloud import WordCloud, STOPWORDS
wordcloud = WordCloud(stopwords=STOPWORDS,
background_color='white',
width=1200,
height=1000).generate(str(navigation_success['search_terms']))
plt.figure( figsize=(8,6) )
plt.imshow(wordcloud)
plt.title('Frequent words in our search_terms')
plt.axis('off')
plt.show()
from collections import Counter
p = Counter(navigation_success['search_terms']).most_common(10)
rslt = pd.DataFrame(p, columns=['Word', 'Frequency'])
print(rslt)
Word Frequency 0 appointment 1 1 signature program enrollment, fan 5339885 1 2 discount 1 3 email login 1 4 connectivity program, connectivity program 1 5 i wanna change my wi-fi password, i wanna c... 1 6 contact information, contact information, ... 1 7 employee and retiree discount 1 8 receive free at&t cell booster because of up... 1 9 anti virus 1
### Error codes
from collections import Counter
p = Counter(navigation_success['ERROR_CODES']).most_common(10)
rslt = pd.DataFrame(p, columns=['Error code', 'Frequency'])
print(rslt)
Error code Frequency 0 0,-2 3 1 -2 2 2 -2,1 2 3 1,-2 2 4 0,1 1 5 205.4,205.2,MSFIDACL08,MSFIDELAG00 1 6 1 1 7 205.4 1 8 902,201,-2,1 1 9 902,205.4,-2 1
##Search Terms that led to failure in Task Accomplishment
TA_No = df.loc[df['TA'] == 'No']
from wordcloud import WordCloud, STOPWORDS
wordcloud = WordCloud(stopwords=STOPWORDS,
background_color='white',
width=1200,
height=1000).generate(str(TA_No['search_terms']))
p = Counter(TA_No['search_terms'], ).most_common(10)
rslt = pd.DataFrame(p, columns=['Word', 'Frequency'])
print(rslt)
Word Frequency 0 appointment 1 1 signature program enrollment, fan 5339885 1 2 discount 1 3 email login 1 4 how do i print my statement 1 5 feedback, feedback, complaints, feedback,... 1 6 i wanna change my wi-fi password, i wanna c... 1 7 contact information, contact information, ... 1 8 receive free at&t cell booster because of up... 1 9 anti virus 1
### Error codes for Navigation
from collections import Counter
p = Counter(navigation_success['ERROR_CODES']).most_common(10)
rslt = pd.DataFrame(p, columns=['Error code', 'Frequency'])
print(rslt)
Error code Frequency 0 0,-2 3 1 -2 2 2 -2,1 2 3 1,-2 2 4 0,1 1 5 205.4,205.2,MSFIDACL08,MSFIDELAG00 1 6 1 1 7 205.4 1 8 902,201,-2,1 1 9 902,205.4,-2 1
### Error codes for Task Accomplishment
p = Counter(TA_No['ERROR_CODES']).most_common(10)
rslt = pd.DataFrame(p, columns=['Error code', 'Frequency'])
print(rslt)
Error code Frequency 0 0,-2 3 1 -2 2 2 0 2 3 1,-2 2 4 0,1 1 5 205.4,205.2,MSFIDACL08,MSFIDELAG00 1 6 0,LGN_02_07,-2 1 7 1 1 8 205.4 1 9 902,201,-2,1 1
navigation_failure = df.loc[df['Navigation_Difficulties'] == 'No']
from wordcloud import WordCloud, STOPWORDS
wordcloud = WordCloud(stopwords=STOPWORDS,
background_color='white',
width=1200,
height=1000).generate(str(navigation_failure['search_terms']))
plt.figure( figsize=(8,6) )
plt.imshow(wordcloud)
plt.title('Frequent words in our search_terms')
plt.axis('off')
plt.show()
from collections import Counter
p = Counter(navigation_failure['search_terms']).most_common(10)
rslt = pd.DataFrame(p, columns=['Word', 'Frequency'])
print(rslt)
Word Frequency 0 how do i print my statement 1 1 reward card 1 2 feedback, feedback, complaints, feedback,... 1 3 unlock phone 1 4 unlimi 1 5 report text spam 1 6 transfer number to another carrier, unlock 1 7 suspend 1 8 international 1 9 chat 1
### Error codes
p = Counter(navigation_failure['ERROR_CODES']).most_common(10)
rslt = pd.DataFrame(p, columns=['Error code', 'Frequency'])
print(rslt)
Error code Frequency 0 0 2 1 -2,1 2 2 1,-2 2 3 201,-2 1 4 0,LGN_02_07,-2 1 5 -2,1,201 1 6 0,201,1 1 7 0,1,201,-2 1 8 -2 1 9 -2,201 1
##Search Terms that led to Task Accomplishment
TA_Yes = df.loc[df['TA'] == 'Yes']
from wordcloud import WordCloud, STOPWORDS
wordcloud = WordCloud(stopwords=STOPWORDS,
background_color='white',
width=1200,
height=1000).generate(str(TA_Yes['search_terms']))
p = Counter(TA_Yes['search_terms']).most_common(10)
rslt = pd.DataFrame(p, columns=['Search Terms', 'Frequency'])
print(rslt)
Search Terms Frequency 0 connectivity program, connectivity program 1 1 reward card 1 2 unlock phone 1 3 unlimi 1 4 employee and retiree discount 1 5 report text spam 1 6 transfer number to another carrier, unlock 1 7 suspend 1 8 international 1 9 voice mail 1
### Error codes
p = Counter(TA_Yes['ERROR_CODES']).most_common(10)
rslt = pd.DataFrame(p, columns=['Error code', 'Frequency'])
print(rslt)
Error code Frequency 0 -2,1 3 1 1,-2 2 2 -2 1 3 201,-2 1 4 -2,1,201 1 5 0 1 6 0,201,1 1 7 0,1,201,-2 1
Inference: The best four features selected through forward feature selection are the almost same features we chose from a set of features, except searches having a good correlation with the target variable.
# Initialize the dataset
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=1)
knn = KNeighborsClassifier(n_neighbors=3)
# Select the "best" three features via
# 5-fold cross-validation on the training set.
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
fs1 = EFS(knn,
min_features=1,
max_features=4,
scoring='accuracy',
cv=5)
fs1 = fs1.fit(X_train, y_train)
Features: 15/15
print('Selected features:', fs1.best_idx_)
Selected features: (2, 3)
# Generate the new subsets based on the selected features
# Note that the transform call is equivalent to
# X_train[:, fs1.k_feature_idx_]
X_train_fs = fs1.transform(X_train)
X_test_fs = fs1.transform(X_test)
# Fit the estimator using the new feature subset
# and make a prediction on the test data
knn.fit(X_train_fs, y_train)
y_pred = knn.predict(X_test_fs)
# Compute the accuracy of the prediction
acc = float((y_test == y_pred).sum()) / y_pred.shape[0]
print('Test set accuracy: %.2f %%' % (acc*100))
Test set accuracy: 96.00 %
y_pred_tr = knn.predict(X_train_fs)
print("Accuracy score for training data in knn Model", accuracy_score(y_train, y_pred_tr, normalize=True))
print("Accuracy score for test data in knn Model", accuracy_score(y_test, y_pred, normalize=True))
Accuracy score for training data in knn Model 0.96 Accuracy score for test data in knn Model 0.96
df_result = pd.read_csv("result.csv")
df = pd.read_csv("Navigation_Dataset.csv")
## Discard values of each feature that are 2x standard deviation away from the mean values
for feature in df.select_dtypes(include=['int16', 'int32', 'int64', 'float16', 'float32', 'float64']):
df = df[np.abs(df[feature]-df[feature].mean()) <= (2*df[feature].std())]
df_final = pd.merge(left=df, right=df_result, left_on='foresee_session_id', right_on='FORESEE_SESSION')
df_final
| foresee_respondent | foresee_session_id | wt_visitor_id | day | Survey_Category | Product | Platform | main_task | CSAT | TA | Effort | Navigation_Difficulties | Relevance | called_flag | app_session_id | event_count | page_loads | link_clicks | impressions | urls | Searches | Page Events | Page Views | Average Page Depth | Reloads | Time Spent per Visit (seconds) | FORESEE_SESSION | APP_SESSION_ID | EVENT_DATE | EVENT_DATETIME | PAGE_URL | PAGE_GROUP | EVENT_CODE | R | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | GUhYhGecMg3xfNbf0mJ9j97Dybjj2Lto | 019bab3a-e4fc-48e8-b54a-d3c241aa8465 | 7.418337e+09 | 11/26/21 | Support | Wireless | Mobile Web | Troubleshooting - fix a problem with my device... | 0.0 | No | 1 | No | 1 | 0 | 98674899-e129-492a-8d86-2c2af6024a82 | 46.0 | 13.0 | 13.0 | 10.0 | /support/topic/wireless, /buy/phones/details, ... | 0.0 | 7.0 | 2.0 | 17.5 | 0.0 | 295.0 | 019bab3a-e4fc-48e8-b54a-d3c241aa8465 | 98674899-e129-492a-8d86-2c2af6024a82 | 26/11/2021 | 38:56.6 | /support/ | NaN | Link_Click | 1 |
| 1 | GUhYhGecMg3xfNbf0mJ9j97Dybjj2Lto | 019bab3a-e4fc-48e8-b54a-d3c241aa8465 | 7.418337e+09 | 11/26/21 | Support | Wireless | Mobile Web | Troubleshooting - fix a problem with my device... | 0.0 | No | 1 | No | 1 | 0 | 98674899-e129-492a-8d86-2c2af6024a82 | 46.0 | 13.0 | 13.0 | 10.0 | /support/topic/wireless, /buy/phones/details, ... | 0.0 | 7.0 | 2.0 | 17.5 | 0.0 | 295.0 | 019bab3a-e4fc-48e8-b54a-d3c241aa8465 | 98674899-e129-492a-8d86-2c2af6024a82 | 26/11/2021 | 38:58.3 | /support/ | NaN | Link_Click | 2 |
| 2 | GUhYhGecMg3xfNbf0mJ9j97Dybjj2Lto | 019bab3a-e4fc-48e8-b54a-d3c241aa8465 | 7.418337e+09 | 11/26/21 | Support | Wireless | Mobile Web | Troubleshooting - fix a problem with my device... | 0.0 | No | 1 | No | 1 | 0 | 98674899-e129-492a-8d86-2c2af6024a82 | 46.0 | 13.0 | 13.0 | 10.0 | /support/topic/wireless, /buy/phones/details, ... | 0.0 | 7.0 | 2.0 | 17.5 | 0.0 | 295.0 | 019bab3a-e4fc-48e8-b54a-d3c241aa8465 | 98674899-e129-492a-8d86-2c2af6024a82 | 26/11/2021 | 39:08.4 | /support/ | NaN | Link_Click | 3 |
| 3 | GUhYhGecMg3xfNbf0mJ9j97Dybjj2Lto | 019bab3a-e4fc-48e8-b54a-d3c241aa8465 | 7.418337e+09 | 11/26/21 | Support | Wireless | Mobile Web | Troubleshooting - fix a problem with my device... | 0.0 | No | 1 | No | 1 | 0 | 98674899-e129-492a-8d86-2c2af6024a82 | 46.0 | 13.0 | 13.0 | 10.0 | /support/topic/wireless, /buy/phones/details, ... | 0.0 | 7.0 | 2.0 | 17.5 | 0.0 | 295.0 | 019bab3a-e4fc-48e8-b54a-d3c241aa8465 | 98674899-e129-492a-8d86-2c2af6024a82 | 26/11/2021 | 39:14.3 | /support/topic/wireless | SUPPORT (Not Product-Specific) | Foresee_Feedback_Invite_System_Display | 4 |
| 4 | GUhYhGecMg3xfNbf0mJ9j97Dybjj2Lto | 019bab3a-e4fc-48e8-b54a-d3c241aa8465 | 7.418337e+09 | 11/26/21 | Support | Wireless | Mobile Web | Troubleshooting - fix a problem with my device... | 0.0 | No | 1 | No | 1 | 0 | 98674899-e129-492a-8d86-2c2af6024a82 | 46.0 | 13.0 | 13.0 | 10.0 | /support/topic/wireless, /buy/phones/details, ... | 0.0 | 7.0 | 2.0 | 17.5 | 0.0 | 295.0 | 019bab3a-e4fc-48e8-b54a-d3c241aa8465 | 98674899-e129-492a-8d86-2c2af6024a82 | 26/11/2021 | 39:21.9 | /support/topic/wireless | SUPPORT (Not Product-Specific) | Link_Click | 5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6519 | gQcYgcARZAwQB1FlUYcwxw4C | 4f9bc5f4-b536-440a-aed5-60c476f0219b | 7.723804e+09 | 2/3/22 | Support | Other services from AT&T (Home phone, DSL, etc.) | Desktop | Troubleshooting - fix a problem with my device... | 100.0 | No | 4 | No | 7 | 0 | HXh0ZV8dxwVMeLBiPcjpa33OUjRUZE1mhyMv_TsoNW4 | 49.0 | 10.0 | 17.0 | 11.0 | /, /acctmgmt/billandpay, /support/, /support/t... | 0.0 | 2.0 | 4.0 | 18.5 | 0.0 | 861.0 | 4f9bc5f4-b536-440a-aed5-60c476f0219b | HXh0ZV8dxwVMeLBiPcjpa33OUjRUZE1mhyMv_TsoNW4 | 03/02/2022 | 37:37.0 | /acctmgmt/accountoverview | ONLINE CARE (Not Product-Specific) | Link_Click | 12 |
| 6520 | gQcYgcARZAwQB1FlUYcwxw4C | 4f9bc5f4-b536-440a-aed5-60c476f0219b | 7.723804e+09 | 2/3/22 | Support | Other services from AT&T (Home phone, DSL, etc.) | Desktop | Troubleshooting - fix a problem with my device... | 100.0 | No | 4 | No | 7 | 0 | HXh0ZV8dxwVMeLBiPcjpa33OUjRUZE1mhyMv_TsoNW4 | 49.0 | 10.0 | 17.0 | 11.0 | /, /acctmgmt/billandpay, /support/, /support/t... | 0.0 | 2.0 | 4.0 | 18.5 | 0.0 | 861.0 | 4f9bc5f4-b536-440a-aed5-60c476f0219b | HXh0ZV8dxwVMeLBiPcjpa33OUjRUZE1mhyMv_TsoNW4 | 03/02/2022 | 37:41.0 | /acctmgmt/accountoverview | ONLINE CARE (Not Product-Specific) | Link_Click | 13 |
| 6521 | gQcYgcARZAwQB1FlUYcwxw4C | 4f9bc5f4-b536-440a-aed5-60c476f0219b | 7.723804e+09 | 2/3/22 | Support | Other services from AT&T (Home phone, DSL, etc.) | Desktop | Troubleshooting - fix a problem with my device... | 100.0 | No | 4 | No | 7 | 0 | HXh0ZV8dxwVMeLBiPcjpa33OUjRUZE1mhyMv_TsoNW4 | 49.0 | 10.0 | 17.0 | 11.0 | /, /acctmgmt/billandpay, /support/, /support/t... | 0.0 | 2.0 | 4.0 | 18.5 | 0.0 | 861.0 | 4f9bc5f4-b536-440a-aed5-60c476f0219b | HXh0ZV8dxwVMeLBiPcjpa33OUjRUZE1mhyMv_TsoNW4 | 03/02/2022 | 38:00.0 | /acctmgmt/accountoverview | ONLINE CARE (Not Product-Specific) | Link_Click | 14 |
| 6522 | gQcYgcARZAwQB1FlUYcwxw4C | 4f9bc5f4-b536-440a-aed5-60c476f0219b | 7.723804e+09 | 2/3/22 | Support | Other services from AT&T (Home phone, DSL, etc.) | Desktop | Troubleshooting - fix a problem with my device... | 100.0 | No | 4 | No | 7 | 0 | HXh0ZV8dxwVMeLBiPcjpa33OUjRUZE1mhyMv_TsoNW4 | 49.0 | 10.0 | 17.0 | 11.0 | /, /acctmgmt/billandpay, /support/, /support/t... | 0.0 | 2.0 | 4.0 | 18.5 | 0.0 | 861.0 | 4f9bc5f4-b536-440a-aed5-60c476f0219b | HXh0ZV8dxwVMeLBiPcjpa33OUjRUZE1mhyMv_TsoNW4 | 03/02/2022 | 38:02.0 | /acctmgmt/billandpay | ONLINE CARE (Not Product-Specific) | Page_Load | 15 |
| 6523 | gQcYgcARZAwQB1FlUYcwxw4C | 4f9bc5f4-b536-440a-aed5-60c476f0219b | 7.723804e+09 | 2/3/22 | Support | Other services from AT&T (Home phone, DSL, etc.) | Desktop | Troubleshooting - fix a problem with my device... | 100.0 | No | 4 | No | 7 | 0 | HXh0ZV8dxwVMeLBiPcjpa33OUjRUZE1mhyMv_TsoNW4 | 49.0 | 10.0 | 17.0 | 11.0 | /, /acctmgmt/billandpay, /support/, /support/t... | 0.0 | 2.0 | 4.0 | 18.5 | 0.0 | 861.0 | 4f9bc5f4-b536-440a-aed5-60c476f0219b | HXh0ZV8dxwVMeLBiPcjpa33OUjRUZE1mhyMv_TsoNW4 | 03/02/2022 | 38:20.0 | /acctmgmt/billandpay | ONLINE CARE (Not Product-Specific) | Link_Click | 16 |
6524 rows × 34 columns
df_final.PAGE_URL.value_counts(ascending = False)[:10].plot(kind = 'barh')
plt.title("Majority of things customers were doing before they clicked on search")
Text(0.5, 1.0, 'Majority of things customers were doing before they clicked on search')
df_final = df_final.loc[df_final['Searches'] != 0.0]
df_final.Searches.value_counts(ascending = True).plot(kind = 'barh')
<AxesSubplot:>
df_final = df_final.dropna()
df_final = df_final[~df_final['PAGE_URL'].str.contains('/support/')]
nd_pu = df_final.groupby(['Navigation_Difficulties','PAGE_URL']).size().reset_index(name='count').\
sort_values(by = 'count', ascending = False).reset_index().drop(['index'], axis = 1)[:40]
## Show top 50 results
plt.figure(figsize=(10,10))
sns.barplot(y = 'PAGE_URL', x = "count", data = nd_pu, hue = 'Navigation_Difficulties')
<AxesSubplot:xlabel='count', ylabel='PAGE_URL'>
TA_PU = df_final.groupby(['TA','PAGE_URL']).size().reset_index(name='count')\
.sort_values(by = 'count', ascending = False).reset_index().drop(['index'], axis = 1)[:40]
## Show top 50 results
plt.figure(figsize=(10,10))
sns.barplot(y = 'PAGE_URL', x = "count", data = TA_PU, hue = 'TA')
<AxesSubplot:xlabel='count', ylabel='PAGE_URL'>
selected_features_5 = df_final[['event_count', 'page_loads','Searches', "Navigation_Difficulties",
'Page Events', 'Page Views','Average Page Depth', 'Reloads',
'Time Spent per Visit (seconds)']].dropna()
### Set survey category as target variable
X = selected_features_5.drop(['Searches'], axis = 1)
y = selected_features_5['Searches']
#Split the dataset with 80% data for training and 20% for testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 0)
from sklearn.neighbors import KNeighborsClassifier
# define the model
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
# summarize feature importance
X = selected_features_5.drop(['Searches'], axis = 1)
y = selected_features_5['Searches']
knn = KNeighborsClassifier(n_neighbors=3)
efs1 = EFS(knn,
min_features=4,
max_features=6,
scoring='accuracy',
print_progress=True,
cv=5)
efs1 = efs1.fit(X, y)
print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)
print('Best subset (corresponding names):', efs1.best_feature_names_)
Features: 154/154
Best accuracy score: 0.95
Best subset (indices): (0, 1, 4, 6)
Best subset (corresponding names): ('event_count', 'page_loads', 'Page Views', 'Reloads')
fig=plt.figure(figsize=(15,15))
for e, column in enumerate(['event_count', 'page_loads', 'Page Events', 'Time Spent per Visit (seconds)']):
plt.subplot(4, 3, e + 1)
sns.regplot( data= df_final, x=column, y='Searches', line_kws={"color": "green"})
Inference: The above plot shows that all these 4 features; event_count, page_loads, Page Events and Time Spent per Visit have positive correlation with Number of searches made by the customer.
#df_final[['PAGE_URL']]
df_final[['Path1','Path2', 'Path3', 'Path4', 'Path5']] = df_final['PAGE_URL'].str.split('/', expand=True)
df_final.head()
| foresee_respondent | foresee_session_id | wt_visitor_id | day | Survey_Category | Product | Platform | main_task | CSAT | TA | Effort | Navigation_Difficulties | Relevance | called_flag | app_session_id | event_count | page_loads | link_clicks | impressions | urls | Searches | Page Events | Page Views | Average Page Depth | Reloads | Time Spent per Visit (seconds) | FORESEE_SESSION | APP_SESSION_ID | EVENT_DATE | EVENT_DATETIME | PAGE_URL | PAGE_GROUP | EVENT_CODE | R | Path1 | Path2 | Path3 | Path4 | Path5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 64 | oJNB91QlQUVsMgtst18Vcw4C | 2115de72-a3ce-4aaf-b08f-d3aa476e6024 | 7.679903e+09 | 1/5/22 | Account Management | Wireless | Desktop | View previous bill(s) | 0.0 | No | 2 | Yes | 2 | 0 | fe1YBsBbQrwETnEZCltkd2o2HBFXLuUFz_IoLZYH_58 | 60.0 | 19.0 | 17.0 | 9.0 | /support/contact-us/, /wireless/, , /acctmgmt/... | 6.0 | 16.0 | 5.0 | 26.0 | 2.0 | 1055.0 | 2115de72-a3ce-4aaf-b08f-d3aa476e6024 | fe1YBsBbQrwETnEZCltkd2o2HBFXLuUFz_IoLZYH_58 | 05/01/2022 | 18:36.0 | /wireless/ | SALES (Not Product-Specific) | Link_Click | 1 | wireless | None | None | ||
| 65 | oJNB91QlQUVsMgtst18Vcw4C | 2115de72-a3ce-4aaf-b08f-d3aa476e6024 | 7.679903e+09 | 1/5/22 | Account Management | Wireless | Desktop | View previous bill(s) | 0.0 | No | 2 | Yes | 2 | 0 | fe1YBsBbQrwETnEZCltkd2o2HBFXLuUFz_IoLZYH_58 | 60.0 | 19.0 | 17.0 | 9.0 | /support/contact-us/, /wireless/, , /acctmgmt/... | 6.0 | 16.0 | 5.0 | 26.0 | 2.0 | 1055.0 | 2115de72-a3ce-4aaf-b08f-d3aa476e6024 | fe1YBsBbQrwETnEZCltkd2o2HBFXLuUFz_IoLZYH_58 | 05/01/2022 | 18:45.0 | /wireless/ | SALES (Not Product-Specific) | Link_Click | 2 | wireless | None | None | ||
| 66 | oJNB91QlQUVsMgtst18Vcw4C | 2115de72-a3ce-4aaf-b08f-d3aa476e6024 | 7.679903e+09 | 1/5/22 | Account Management | Wireless | Desktop | View previous bill(s) | 0.0 | No | 2 | Yes | 2 | 0 | fe1YBsBbQrwETnEZCltkd2o2HBFXLuUFz_IoLZYH_58 | 60.0 | 19.0 | 17.0 | 9.0 | /support/contact-us/, /wireless/, , /acctmgmt/... | 6.0 | 16.0 | 5.0 | 26.0 | 2.0 | 1055.0 | 2115de72-a3ce-4aaf-b08f-d3aa476e6024 | fe1YBsBbQrwETnEZCltkd2o2HBFXLuUFz_IoLZYH_58 | 05/01/2022 | 26:32.0 | /acctmgmt/billandpay | ONLINE CARE (Not Product-Specific) | Page_Load | 3 | acctmgmt | billandpay | None | None | |
| 67 | oJNB91QlQUVsMgtst18Vcw4C | 2115de72-a3ce-4aaf-b08f-d3aa476e6024 | 7.679903e+09 | 1/5/22 | Account Management | Wireless | Desktop | View previous bill(s) | 0.0 | No | 2 | Yes | 2 | 0 | fe1YBsBbQrwETnEZCltkd2o2HBFXLuUFz_IoLZYH_58 | 60.0 | 19.0 | 17.0 | 9.0 | /support/contact-us/, /wireless/, , /acctmgmt/... | 6.0 | 16.0 | 5.0 | 26.0 | 2.0 | 1055.0 | 2115de72-a3ce-4aaf-b08f-d3aa476e6024 | fe1YBsBbQrwETnEZCltkd2o2HBFXLuUFz_IoLZYH_58 | 05/01/2022 | 26:37.0 | /acctmgmt/billandpay/history | ONLINE CARE (Not Product-Specific) | Page_Load | 4 | acctmgmt | billandpay | history | None | |
| 68 | oJNB91QlQUVsMgtst18Vcw4C | 2115de72-a3ce-4aaf-b08f-d3aa476e6024 | 7.679903e+09 | 1/5/22 | Account Management | Wireless | Desktop | View previous bill(s) | 0.0 | No | 2 | Yes | 2 | 0 | fe1YBsBbQrwETnEZCltkd2o2HBFXLuUFz_IoLZYH_58 | 60.0 | 19.0 | 17.0 | 9.0 | /support/contact-us/, /wireless/, , /acctmgmt/... | 6.0 | 16.0 | 5.0 | 26.0 | 2.0 | 1055.0 | 2115de72-a3ce-4aaf-b08f-d3aa476e6024 | fe1YBsBbQrwETnEZCltkd2o2HBFXLuUFz_IoLZYH_58 | 05/01/2022 | 26:41.0 | /acctmgmt/atvbillpayments | ONLINE CARE (Not Product-Specific) | Page_Load | 5 | acctmgmt | atvbillpayments | None | None |
df_final[['Path6', 'Path7']] = df_final['urls'].str.split('/', expand=True).iloc[:, [14,17]]
df_final[['Path2', 'Path3', 'Path4', 'Path5', 'Path6', 'Path7']] = df_final[['Path2', 'Path3', 'Path4', 'Path5','Path6', 'Path7']].replace(to_replace=[None], value=np.nan)
df_final[['Path2', 'Path3', 'Path4', 'Path5','Path6', 'Path7']] = df_final[['Path2', 'Path3', 'Path4', 'Path5','Path6', 'Path7']].replace(to_replace='', value=np.nan)
df_final.head()
| foresee_respondent | foresee_session_id | wt_visitor_id | day | Survey_Category | Product | Platform | main_task | CSAT | TA | Effort | Navigation_Difficulties | Relevance | called_flag | app_session_id | event_count | page_loads | link_clicks | impressions | urls | Searches | Page Events | Page Views | Average Page Depth | Reloads | Time Spent per Visit (seconds) | FORESEE_SESSION | APP_SESSION_ID | EVENT_DATE | EVENT_DATETIME | PAGE_URL | PAGE_GROUP | EVENT_CODE | R | Path1 | Path2 | Path3 | Path4 | Path5 | Path6 | Path7 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 64 | oJNB91QlQUVsMgtst18Vcw4C | 2115de72-a3ce-4aaf-b08f-d3aa476e6024 | 7.679903e+09 | 1/5/22 | Account Management | Wireless | Desktop | View previous bill(s) | 0.0 | No | 2 | Yes | 2 | 0 | fe1YBsBbQrwETnEZCltkd2o2HBFXLuUFz_IoLZYH_58 | 60.0 | 19.0 | 17.0 | 9.0 | /support/contact-us/, /wireless/, , /acctmgmt/... | 6.0 | 16.0 | 5.0 | 26.0 | 2.0 | 1055.0 | 2115de72-a3ce-4aaf-b08f-d3aa476e6024 | fe1YBsBbQrwETnEZCltkd2o2HBFXLuUFz_IoLZYH_58 | 05/01/2022 | 18:36.0 | /wireless/ | SALES (Not Product-Specific) | Link_Click | 1 | wireless | NaN | NaN | NaN | search, | u-verse-voice | |
| 65 | oJNB91QlQUVsMgtst18Vcw4C | 2115de72-a3ce-4aaf-b08f-d3aa476e6024 | 7.679903e+09 | 1/5/22 | Account Management | Wireless | Desktop | View previous bill(s) | 0.0 | No | 2 | Yes | 2 | 0 | fe1YBsBbQrwETnEZCltkd2o2HBFXLuUFz_IoLZYH_58 | 60.0 | 19.0 | 17.0 | 9.0 | /support/contact-us/, /wireless/, , /acctmgmt/... | 6.0 | 16.0 | 5.0 | 26.0 | 2.0 | 1055.0 | 2115de72-a3ce-4aaf-b08f-d3aa476e6024 | fe1YBsBbQrwETnEZCltkd2o2HBFXLuUFz_IoLZYH_58 | 05/01/2022 | 18:45.0 | /wireless/ | SALES (Not Product-Specific) | Link_Click | 2 | wireless | NaN | NaN | NaN | search, | u-verse-voice | |
| 66 | oJNB91QlQUVsMgtst18Vcw4C | 2115de72-a3ce-4aaf-b08f-d3aa476e6024 | 7.679903e+09 | 1/5/22 | Account Management | Wireless | Desktop | View previous bill(s) | 0.0 | No | 2 | Yes | 2 | 0 | fe1YBsBbQrwETnEZCltkd2o2HBFXLuUFz_IoLZYH_58 | 60.0 | 19.0 | 17.0 | 9.0 | /support/contact-us/, /wireless/, , /acctmgmt/... | 6.0 | 16.0 | 5.0 | 26.0 | 2.0 | 1055.0 | 2115de72-a3ce-4aaf-b08f-d3aa476e6024 | fe1YBsBbQrwETnEZCltkd2o2HBFXLuUFz_IoLZYH_58 | 05/01/2022 | 26:32.0 | /acctmgmt/billandpay | ONLINE CARE (Not Product-Specific) | Page_Load | 3 | acctmgmt | billandpay | NaN | NaN | search, | u-verse-voice | |
| 67 | oJNB91QlQUVsMgtst18Vcw4C | 2115de72-a3ce-4aaf-b08f-d3aa476e6024 | 7.679903e+09 | 1/5/22 | Account Management | Wireless | Desktop | View previous bill(s) | 0.0 | No | 2 | Yes | 2 | 0 | fe1YBsBbQrwETnEZCltkd2o2HBFXLuUFz_IoLZYH_58 | 60.0 | 19.0 | 17.0 | 9.0 | /support/contact-us/, /wireless/, , /acctmgmt/... | 6.0 | 16.0 | 5.0 | 26.0 | 2.0 | 1055.0 | 2115de72-a3ce-4aaf-b08f-d3aa476e6024 | fe1YBsBbQrwETnEZCltkd2o2HBFXLuUFz_IoLZYH_58 | 05/01/2022 | 26:37.0 | /acctmgmt/billandpay/history | ONLINE CARE (Not Product-Specific) | Page_Load | 4 | acctmgmt | billandpay | history | NaN | search, | u-verse-voice | |
| 68 | oJNB91QlQUVsMgtst18Vcw4C | 2115de72-a3ce-4aaf-b08f-d3aa476e6024 | 7.679903e+09 | 1/5/22 | Account Management | Wireless | Desktop | View previous bill(s) | 0.0 | No | 2 | Yes | 2 | 0 | fe1YBsBbQrwETnEZCltkd2o2HBFXLuUFz_IoLZYH_58 | 60.0 | 19.0 | 17.0 | 9.0 | /support/contact-us/, /wireless/, , /acctmgmt/... | 6.0 | 16.0 | 5.0 | 26.0 | 2.0 | 1055.0 | 2115de72-a3ce-4aaf-b08f-d3aa476e6024 | fe1YBsBbQrwETnEZCltkd2o2HBFXLuUFz_IoLZYH_58 | 05/01/2022 | 26:41.0 | /acctmgmt/atvbillpayments | ONLINE CARE (Not Product-Specific) | Page_Load | 5 | acctmgmt | atvbillpayments | NaN | NaN | search, | u-verse-voice |
## Fillup missing values in path with mode values
for features in ['Path2', 'Path3', 'Path4', 'Path5','Path6', 'Path7']:
value = df_final[features].mode()[0]
df_final[features].fillna(value, inplace=True)
The following two graphs show the insights of last 4 pages before search.
values = df_final['Navigation_Difficulties'].mode()[0]
df_final['Navigation_Difficulties'].fillna(values, inplace=True)
##
import plotly.express as px
import plotly.offline as pyo
pyo.init_notebook_mode()
fig = px.sunburst(df_final, path=['Path2', 'Path3', 'Path4', 'Path6', 'Path7'],
values='Effort', color='Navigation_Difficulties', width=800, height=800,color_continuous_scale='RdBu',)
fig.update_layout(uniformtext=dict(minsize=18))
fig.show()
### pennsylvania and pittsburgh
df_pe_pi = df_final[df_final['urls'].str.contains('pennsylvania|pittsburgh')]
fig = px.sunburst(df_pe_pi, path=['Path2', 'Path3', 'Path4', 'Path6', 'Path7'],
values='Searches', color='Navigation_Difficulties', width=860, height=860)
fig.update_layout(uniformtext=dict(minsize=18))
fig.show()
### DTV product
df_dtv = df_final[df_final['Product'] == 'DTV']
fig = px.sunburst(df_dtv, path=['Path2', 'Path3', 'Path4', 'Path6', 'Path7'],
values='Searches', color='Navigation_Difficulties', width=860, height=860)
fig.update_layout(uniformtext=dict(minsize=18))
fig.show()